// -*- mode: c++ -*-

// Copyright (c) 2012-2013, 2015-2018, 2020, 2024-2025 Arm Limited
// All rights reserved
//
// The license below extends only to copyright in the software and shall
// not be construed as granting a license to any other intellectual
// property including but not limited to intellectual property relating
// to a hardware implementation of the functionality of the software
// licensed hereunder.  You may use the software subject to the license
// terms below provided that you ensure that this notice is replicated
// unmodified and in its entirety in all distributions of the software,
// modified or unmodified, in source code or in binary form.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met: redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer;
// redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution;
// neither the name of the copyright holders nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

let {{

    # Accumulators for generated text.  The helper functions in this block
    # append the output of the *Declare templates to header_output and the
    # output of the *Execute / NeonXExecDeclare templates to exec_output.
    header_output = ""
    exec_output = ""
    # NOTE(review): decoders is named in a global statement by
    # threeEqualRegInstX but is not otherwise used in the visible part of
    # this file -- confirm where it is consumed.
    decoders = { 'Generic' : {} }

    # FP types (FP operations always work with unsigned representations)
    floatTypes = ("uint16_t", "uint32_t", "uint64_t")
    smallFloatTypes = ("uint16_t","uint32_t")

    # C++ snippet appended to the "code" snippet of every instruction
    # generated below.  The single %s placeholder receives the destination
    # operand name, which is handed to zeroSveVecRegUpperPart together with
    # the current SVE vector length.
    zeroSveVecRegUpperPartCode = '''
        ArmISA::ISA::zeroSveVecRegUpperPart(%s,
            ArmStaticInst::getCurSveVecLen<uint64_t>(xc->tcBase()));
    '''

    def threeEqualRegInstX(name, Name, opClass, types, rCount, op,
                           readDest=False, pairwise=False, scalar=False,
                           byElem=False, decoder='Generic', complex=False,
                           extra=''):
        # Generate an instruction with two source vectors (AA64FpOp1 and
        # AA64FpOp2) and a destination (AA64FpDest) that are all walked with
        # the same Element type.
        #
        #   name, Name -- forwarded to ArmInstObjParams; Name is also the
        #                 class name of the per-type instantiations
        #   opClass    -- stored as the "op_class" snippet
        #   types      -- one NeonXExecDeclare instantiation per entry
        #   rCount     -- number of AA64Fp*P<n>_uw pieces loaded and written
        #                 back; destination pieces rCount..3 are cleared
        #   op         -- C++ fragment inserted after srcElem1, srcElem2 and
        #                 destElem are declared (with complex=True it is
        #                 inserted on its own, without any element loop)
        #   readDest   -- also load the previous destination contents
        #   pairwise   -- srcElem1/srcElem2 are adjacent elements taken from
        #                 srcReg1 followed by srcReg2
        #   scalar     -- only element 0 is computed, the others are zeroed
        #   byElem     -- srcReg2 is read in full and indexed with imm
        #   decoder    -- not used by this function
        #   extra      -- C++ emitted right before the element loop of the
        #                 plain (non-complex, non-pairwise) form
        global header_output, exec_output, decoders
        # The pairwise form excludes both the by-element and scalar forms.
        assert not (pairwise and (byElem or scalar))

        loadSrcTmpl = '''
        srcReg1.regs[%(reg)d] = htole(AA64FpOp1P%(reg)d_uw);
        srcReg2.regs[%(reg)d] = htole(AA64FpOp2P%(reg)d_uw);
        '''
        loadSrc2Tmpl = '''
        srcReg2.regs[%(reg)d] = htole(AA64FpOp2P%(reg)d_uw);
        '''
        loadDestTmpl = '''
        destReg.regs[%(reg)d] = htole(AA64FpDestP%(reg)d_uw);
        '''
        storeDestTmpl = '''
        AA64FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
        '''
        clearDestTmpl = '''
        AA64FpDestP%(reg)d_uw = 0;
        '''

        eWalkCode = simd64EnabledCheckCode + '''
        RegVect srcReg1, destReg;
        '''
        # For by-element forms the 2nd register operand is read fully.
        eWalkCode += '''
        FullRegVect srcReg2;
        ''' if byElem else '''
        RegVect srcReg2;
        '''
        for piece in range(rCount):
            eWalkCode += loadSrcTmpl % { "reg" : piece }
            if readDest:
                eWalkCode += loadDestTmpl % { "reg" : piece }
        if byElem:
            # Remaining pieces of the fully-read 2nd operand.
            for piece in range(rCount, 4):
                eWalkCode += loadSrc2Tmpl % { "reg" : piece }

        readDestCode = ('destElem = letoh(destReg.elements[i]);'
                        if readDest else '')

        if complex:
            # The caller supplies the complete computation.
            eWalkCode += op
        elif pairwise:
            eWalkCode += '''
        for (unsigned i = 0; i < eCount; i++) {
            Element srcElem1 = letoh(2 * i < eCount ?
                                     srcReg1.elements[2 * i] :
                                     srcReg2.elements[2 * i - eCount]);
            Element srcElem2 = letoh(2 * i < eCount ?
                                     srcReg1.elements[2 * i + 1] :
                                     srcReg2.elements[2 * i + 1 - eCount]);
            Element destElem;
            %(readDest)s
            %(op)s
            destReg.elements[i] = htole(destElem);
        }
        ''' % { "op" : op, "readDest" : readDestCode }
        else:
            scalarCheck = '''
            if (i != 0) {
                destReg.elements[i] = 0;
                continue;
            }
            '''
            eWalkCode += extra + '''
        for (unsigned i = 0; i < eCount; i++) {
            %(scalarCheck)s
            Element srcElem1 = letoh(srcReg1.elements[i]);
            Element srcElem2 = letoh(srcReg2.elements[%(src2Index)s]);
            Element destElem;
            %(readDest)s
            %(op)s
            destReg.elements[i] = htole(destElem);
        }
        ''' % { "op" : op, "readDest" : readDestCode,
                "scalarCheck" : scalarCheck if scalar else "",
                "src2Index" : "imm" if byElem else "i" }

        for piece in range(rCount):
            eWalkCode += storeDestTmpl % { "reg" : piece }
        # zero upper half (empty range when rCount is 4)
        for piece in range(rCount, 4):
            eWalkCode += clearDestTmpl % { "reg" : piece }

        baseClass = "DataX2RegImmOp" if byElem else "DataX2RegOp"
        iop = ArmInstObjParams(name, Name, baseClass,
                               { "code": eWalkCode,
                                 "r_count": rCount,
                                 "op_class": opClass }, [])
        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
        declTmpl = NeonX2RegImmOpDeclare if byElem else NeonX2RegOpDeclare
        header_output += declTmpl.subst(iop)
        exec_output += NeonXEqualRegOpExecute.subst(iop)
        for targs in types:
            exec_output += NeonXExecDeclare.subst({ "targs" : targs,
                                                    "class_name" : Name })

    def threeUnequalRegInstX(name, Name, opClass, types, op,
                             bigSrc1, bigSrc2, bigDest, readDest, scalar=False,
                             byElem=False, hi=False, short=False,
                             extra_check=''):
        # Generate an instruction with two sources and a destination whose
        # vectors may differ in size.
        #
        #   bigSrc1/bigSrc2/bigDest -- the operand uses the Big* vector type
        #                 and twice as many register pieces
        #   readDest   -- load the previous destination contents
        #   scalar     -- only element 0 is computed (cannot be used with hi)
        #   byElem     -- srcReg2 is a FullRegVect read in full, indexed by
        #                 imm
        #   hi         -- non-big sources are read from, and a non-big
        #                 destination is written to, the pieces starting at
        #                 index 1 (short) or 2 instead of 0
        #   short      -- halves all piece counts (1/2 instead of 2/4)
        #   extra_check -- C++ emitted right after simd64EnabledCheckCode
        # The remaining arguments are used as in the other helpers: name,
        # Name and opClass go to ArmInstObjParams, op is inserted inside the
        # element loop and types drives the NeonXExecDeclare instantiations.
        global header_output, exec_output
        assert not (scalar and hi)

        normalCnt, bigCnt = (1, 2) if short else (2, 4)
        src1Cnt, src1Prefix = (bigCnt, 'Big') if bigSrc1 else (normalCnt, '')
        src2Cnt, src2Prefix = (bigCnt, 'Big') if bigSrc2 else (normalCnt, '')
        destCnt, destPrefix = (bigCnt, 'Big') if bigDest else (normalCnt, '')
        if byElem:
            src2Prefix = 'Full'

        eWalkCode = simd64EnabledCheckCode + extra_check + '''
        %sRegVect srcReg1;
        %sRegVect srcReg2;
        %sRegVect destReg;
        ''' % (src1Prefix, src2Prefix, destPrefix)

        # long/widening operations take a non-big 1st source from the
        # upper pieces when hi is set
        src1Base = normalCnt if (hi and not bigSrc1) else 0
        for piece in range(src1Cnt):
            eWalkCode += '''
        srcReg1.regs[%(reg)d] = htole(AA64FpOp1P%(srcReg1)d_uw);
        ''' % { "reg" : piece, "srcReg1" : src1Base + piece }

        # same for the 2nd source, unless it is indexed by element
        src2Base = normalCnt if ((not byElem) and (hi and not bigSrc2)) else 0
        for piece in range(src2Cnt):
            eWalkCode += '''
        srcReg2.regs[%(reg)d] = htole(AA64FpOp2P%(srcReg2)d_uw);
        ''' % { "reg" : piece, "srcReg2" : src2Base + piece }
        if byElem:
            # 2nd operand has to be read fully
            for piece in range(src2Cnt, 4):
                eWalkCode += '''
        srcReg2.regs[%(reg)d] = htole(AA64FpOp2P%(reg)d_uw);
        ''' % { "reg" : piece }

        readDestCode = ''
        if readDest:
            for piece in range(destCnt):
                eWalkCode += '''
        destReg.regs[%(reg)d] = htole(AA64FpDestP%(reg)d_uw);
        ''' % { "reg" : piece }
            readDestCode = 'destElem = letoh(destReg.elements[i]);'

        scalarCheck = '''
            if (i != 0) {
                destReg.elements[i] = 0;
                continue;
            }
            '''
        # Note that srcElem2 is declared with the 1st source's element type.
        eWalkCode += '''
        for (unsigned i = 0; i < eCount; i++) {
            %(scalarCheck)s
            %(src1Prefix)sElement srcElem1 = letoh(srcReg1.elements[i]);
            %(src1Prefix)sElement srcElem2 =
                letoh(srcReg2.elements[%(src2Index)s]);
            %(destPrefix)sElement destElem;
            %(readDest)s
            %(op)s
            destReg.elements[i] = htole(destElem);
        }
        ''' % { "op" : op, "readDest" : readDestCode,
                "src1Prefix" : src1Prefix, "src2Prefix" : src2Prefix,
                "destPrefix" : destPrefix,
                "scalarCheck" : scalarCheck if scalar else "",
                "src2Index" : "imm" if byElem else "i" }

        # narrowing operations with hi write the upper pieces
        narrowHi = hi and not bigDest
        destBase = normalCnt if narrowHi else 0
        for piece in range(destCnt):
            eWalkCode += '''
        AA64FpDestP%(destReg)d_uw = letoh(destReg.regs[%(reg)d]);
        ''' % { "reg" : piece, "destReg": destBase + piece }
        if narrowHi:
            # Explicitly merge with lower half
            for piece in range(destCnt):
                eWalkCode += '''
        AA64FpDestP%(reg)d_uw = AA64FpDestP%(reg)d_uw;''' % { "reg" : piece }
        else:
            # zero upper half (empty range when destCnt is 4)
            for piece in range(destCnt, 4):
                eWalkCode += '''
        AA64FpDestP%(reg)d_uw = 0;''' % { "reg" : piece }

        baseClass = "DataX2RegImmOp" if byElem else "DataX2RegOp"
        iop = ArmInstObjParams(name, Name, baseClass,
                               { "code": eWalkCode,
                                 "r_count": normalCnt,
                                 "op_class": opClass }, [])
        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
        declTmpl = NeonX2RegImmOpDeclare if byElem else NeonX2RegOpDeclare
        header_output += declTmpl.subst(iop)
        exec_output += NeonXUnequalRegOpExecute.subst(iop)
        for targs in types:
            exec_output += NeonXExecDeclare.subst({ "targs" : targs,
                                                    "class_name" : Name })

    def threeRegNarrowInstX(name, Name, opClass, types, op, readDest=False,
                            scalar=False, byElem=False, hi=False,
                            extra_check=''):
        # Narrowing form: both sources are big, the destination is not.
        # The by-element variant is rejected.
        assert not byElem
        threeUnequalRegInstX(name, Name, opClass, types, op,
                             bigSrc1=True, bigSrc2=True, bigDest=False,
                             readDest=readDest, scalar=scalar, byElem=byElem,
                             hi=hi, short=False, extra_check=extra_check)

    def threeRegLongInstX(name, Name, opClass, types, op, readDest=False,
                          scalar=False, byElem=False, hi=False, short=False,
                          extra_check=''):
        # Long form: neither source is big, the destination is.
        threeUnequalRegInstX(name, Name, opClass, types, op,
                             bigSrc1=False, bigSrc2=False, bigDest=True,
                             readDest=readDest, scalar=scalar, byElem=byElem,
                             hi=hi, short=short, extra_check=extra_check)

    def threeRegWideInstX(name, Name, opClass, types, op, readDest=False,
                          scalar=False, byElem=False, hi=False,
                          extra_check=''):
        # Wide form: the 1st source and the destination are big, the 2nd
        # source is not.  The by-element variant is rejected.
        assert not byElem
        threeUnequalRegInstX(name, Name, opClass, types, op,
                             bigSrc1=True, bigSrc2=False, bigDest=True,
                             readDest=readDest, scalar=scalar, byElem=byElem,
                             hi=hi, short=False, extra_check=extra_check)

    def twoEqualRegInstX(name, Name, opClass, types, rCount, op,
                         readDest=False, scalar=False, byElem=False,
                         hasImm=False, isDup=False, extra_check=''):
        # Generate an instruction with one source vector (AA64FpOp1) and a
        # destination (AA64FpDest) using the same Element type.
        #
        #   name, Name -- forwarded to ArmInstObjParams; Name is also the
        #                 class name of the per-type instantiations
        #   opClass    -- stored as the "op_class" snippet
        #   types      -- one NeonXExecDeclare instantiation per entry
        #   rCount     -- number of register pieces written back; destination
        #                 pieces rCount..3 are cleared
        #   op         -- C++ fragment inserted after srcElem1, destElem and
        #                 the index j (initialised to i) are declared; the
        #                 result is stored to destReg.elements[j]
        #   readDest   -- load the previous destination contents
        #   scalar     -- only element 0 is computed, the others are zeroed
        #   byElem     -- the source is indexed with imm (implies hasImm)
        #   hasImm     -- use the immediate-taking base class and declaration
        #   isDup      -- the source is a FullRegVect read in full (all 4
        #                 pieces); requires byElem
        #   extra_check -- C++ emitted right after simd64EnabledCheckCode
        global header_output, exec_output
        assert (not isDup) or byElem
        if byElem:
            hasImm = True
        if isDup:
            eWalkCode = simd64EnabledCheckCode + extra_check + '''
        FullRegVect srcReg1;
        RegVect destReg;
        '''
        else:
            eWalkCode = simd64EnabledCheckCode + extra_check + '''
        RegVect srcReg1, destReg;
        '''
        for reg in range(4 if isDup else rCount):
            eWalkCode += '''
        srcReg1.regs[%(reg)d] = htole(AA64FpOp1P%(reg)d_uw);
        ''' % { "reg" : reg }
            # destReg is always a plain RegVect, and only its first rCount
            # pieces are ever written back below, so never load more than
            # rCount destination pieces even when the source is read in
            # full.  (RegVect is presumably sized by r_count; the templates
            # are not visible here.)
            if readDest and reg < rCount:
                eWalkCode += '''
        destReg.regs[%(reg)d] = htole(AA64FpDestP%(reg)d_uw);
        ''' % { "reg" : reg }
        readDestCode = ''
        if readDest:
            readDestCode = 'destElem = letoh(destReg.elements[i]);'
        scalarCheck = '''
            if (i != 0) {
                destReg.elements[i] = 0;
                continue;
            }
            '''
        eWalkCode += '''
        for (unsigned i = 0; i < eCount; i++) {
            %(scalarCheck)s
            unsigned j = i;
            Element srcElem1 = letoh(srcReg1.elements[%(src1Index)s]);
            Element destElem;
            %(readDest)s
            %(op)s
            destReg.elements[j] = htole(destElem);
        }
        ''' % { "op" : op, "readDest" : readDestCode,
                "scalarCheck" : scalarCheck if scalar else "",
                "src1Index" : "imm" if byElem else "i" }
        for reg in range(rCount):
            eWalkCode += '''
        AA64FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
        ''' % { "reg" : reg }
        # zero upper half (empty range when rCount is 4)
        for reg in range(rCount, 4):
            eWalkCode += '''
        AA64FpDestP%(reg)d_uw = 0;
        ''' % { "reg" : reg }
        iop = ArmInstObjParams(name, Name,
                               "DataX1RegImmOp" if hasImm else "DataX1RegOp",
                               { "code": eWalkCode,
                                 "r_count": rCount,
                                 "op_class": opClass }, [])
        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
        if hasImm:
            header_output += NeonX1RegImmOpDeclare.subst(iop)
        else:
            header_output += NeonX1RegOpDeclare.subst(iop)
        exec_output += NeonXEqualRegOpExecute.subst(iop)
        for targs in types:
            substDict = { "targs" : targs,
                          "class_name" : Name }
            exec_output += NeonXExecDeclare.subst(substDict)

    def twoRegLongInstX(name, Name, opClass, types, op, readDest=False,
                        hi=False, hasImm=False, short=False):
        # Generate an instruction with one source vector (AA64FpOp1, plain
        # RegVect) and a destination (AA64FpDest) of the Big* vector type.
        #
        #   name, Name -- forwarded to ArmInstObjParams; Name is also the
        #                 class name of the per-type instantiations
        #   opClass    -- stored as the "op_class" snippet
        #   types      -- one NeonXExecDeclare instantiation per entry
        #   op         -- C++ fragment inserted after srcElem1 and the
        #                 BigElement destElem are declared
        #   readDest   -- load the previous destination contents
        #   hi         -- read the source from the pieces starting at index
        #                 1 (short) or 2 instead of 0
        #   hasImm     -- use the immediate-taking base class and declaration
        #   short      -- 1 source / 2 destination pieces instead of 2 / 4
        global header_output, exec_output
        srcCnt = 1 if short else 2
        destCnt = 2 if short else 4
        eWalkCode = simd64EnabledCheckCode + '''
        RegVect srcReg1;
        BigRegVect destReg = {};
        '''
        srcBase = srcCnt if hi else 0
        for reg in range(srcCnt):
            eWalkCode += '''
        srcReg1.regs[%(reg)d] = htole(AA64FpOp1P%(destReg)d_uw);
        ''' % { "reg" : reg, "destReg": srcBase + reg }
        readDestCode = ''
        if readDest:
            for reg in range(destCnt):
                eWalkCode += '''
        destReg.regs[%(reg)d] = htole(AA64FpDestP%(reg)d_uw);
        ''' % { "reg" : reg }
            # Seed destElem (not the destReg vector itself) with the old
            # destination element, as every other helper in this file does.
            readDestCode = 'destElem = letoh(destReg.elements[i]);'
        eWalkCode += '''
        for (unsigned i = 0; i < eCount; i++) {
            Element srcElem1 = letoh(srcReg1.elements[i]);
            BigElement destElem;
            %(readDest)s
            %(op)s
            destReg.elements[i] = htole(destElem);
        }
        ''' % { "op" : op, "readDest" : readDestCode }
        # NOTE(review): with short=True only pieces 0 and 1 are written and,
        # unlike threeUnequalRegInstX, no explicit zeroing of pieces 2 and 3
        # is emitted -- confirm that this is intended.
        for reg in range(destCnt):
            eWalkCode += '''
        AA64FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
        ''' % { "reg" : reg }
        iop = ArmInstObjParams(name, Name,
                               "DataX1RegImmOp" if hasImm else "DataX1RegOp",
                               { "code": eWalkCode,
                                 "r_count": srcCnt,
                                 "op_class": opClass }, [])
        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
        if hasImm:
            header_output += NeonX1RegImmOpDeclare.subst(iop)
        else:
            header_output += NeonX1RegOpDeclare.subst(iop)
        exec_output += NeonXUnequalRegOpExecute.subst(iop)
        for targs in types:
            substDict = { "targs" : targs,
                          "class_name" : Name }
            exec_output += NeonXExecDeclare.subst(substDict)

    def twoRegNarrowInstX(name, Name, opClass, types, op, readDest=False,
                          scalar=False, hi=False, hasImm=False):
        # Generate an instruction with one Big* source vector (AA64FpOp1,
        # all 4 pieces) and a plain RegVect destination of 2 pieces.
        #
        #   op       -- C++ fragment inserted after the BigElement srcElem1
        #               and Element destElem are declared
        #   readDest -- load the previous destination contents; otherwise
        #               destReg.elements[0] is cleared up front
        #   scalar   -- only element 0 is computed, the others are zeroed
        #   hi       -- write the result to pieces 2..3 and keep pieces
        #               0..1; otherwise write pieces 0..1 and clear 2..3
        #   hasImm   -- use the immediate-taking base class and declaration
        # name, Name, opClass and types are used as in the other helpers.
        global header_output, exec_output
        eWalkCode = simd64EnabledCheckCode + '''
        BigRegVect srcReg1;
        RegVect destReg;
        '''
        eWalkCode += "".join('''
        srcReg1.regs[%(reg)d] = htole(AA64FpOp1P%(reg)d_uw);
        ''' % { "reg" : piece } for piece in range(4))

        if readDest:
            eWalkCode += "".join('''
        destReg.regs[%(reg)d] = htole(AA64FpDestP%(reg)d_uw);
        ''' % { "reg" : piece } for piece in range(2))
            readDestCode = 'destElem = letoh(destReg.elements[i]);'
        else:
            eWalkCode += '''
        destReg.elements[0] = 0;
        '''
            readDestCode = ''

        scalarCheck = '''
            if (i != 0) {
                destReg.elements[i] = 0;
                continue;
            }
            '''
        eWalkCode += '''
        for (unsigned i = 0; i < eCount; i++) {
            %(scalarCheck)s
            BigElement srcElem1 = letoh(srcReg1.elements[i]);
            Element destElem;
            %(readDest)s
            %(op)s
            destReg.elements[i] = htole(destElem);
        }
        ''' % { "op" : op, "readDest" : readDestCode,
                "scalarCheck" : scalarCheck if scalar else "" }

        destBase = 2 if hi else 0
        for piece in range(2):
            eWalkCode += '''
        AA64FpDestP%(destReg)d_uw = letoh(destReg.regs[%(reg)d]);
        ''' % { "reg" : piece, "destReg": destBase + piece }
        if hi:
            # Explicitly merge with the lower half
            for piece in range(2):
                eWalkCode += '''
        AA64FpDestP%(reg)d_uw = AA64FpDestP%(reg)d_uw;''' % { "reg" : piece }
        else:
            # zero upper half
            for piece in range(2, 4):
                eWalkCode += '''
        AA64FpDestP%(reg)d_uw = 0;
        ''' % { "reg" : piece }

        baseClass = "DataX1RegImmOp" if hasImm else "DataX1RegOp"
        iop = ArmInstObjParams(name, Name, baseClass,
                               { "code": eWalkCode,
                                 "r_count": 2,
                                 "op_class": opClass }, [])
        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
        declTmpl = NeonX1RegImmOpDeclare if hasImm else NeonX1RegOpDeclare
        header_output += declTmpl.subst(iop)
        exec_output += NeonXUnequalRegOpExecute.subst(iop)
        for targs in types:
            exec_output += NeonXExecDeclare.subst({ "targs" : targs,
                                                    "class_name" : Name })

    def threeRegScrambleInstX(name, Name, opClass, types, rCount, op):
        # Generate a two-source instruction whose whole computation is
        # supplied by the caller: op is inserted verbatim between loading
        # srcReg1/srcReg2 and writing destReg back.  rCount pieces are
        # loaded and written; destination pieces rCount..3 are cleared.
        global header_output, exec_output
        loads = "".join('''
        srcReg1.regs[%(reg)d] = htole(AA64FpOp1P%(reg)d_uw);
        srcReg2.regs[%(reg)d] = htole(AA64FpOp2P%(reg)d_uw);
        ''' % { "reg" : piece } for piece in range(rCount))
        stores = "".join('''
        AA64FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
        ''' % { "reg" : piece } for piece in range(rCount))
        # Empty when rCount is 4.
        clears = "".join('''
        AA64FpDestP%(reg)d_uw = 0;
        ''' % { "reg" : piece } for piece in range(rCount, 4))
        eWalkCode = simd64EnabledCheckCode + '''
        RegVect srcReg1, srcReg2, destReg;
        ''' + loads + op + stores + clears
        iop = ArmInstObjParams(name, Name, "DataX2RegOp",
                               { "code": eWalkCode,
                                 "r_count": rCount,
                                 "op_class": opClass }, [])
        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
        header_output += NeonX2RegOpDeclare.subst(iop)
        exec_output += NeonXEqualRegOpExecute.subst(iop)
        for targs in types:
            exec_output += NeonXExecDeclare.subst({ "targs" : targs,
                                                    "class_name" : Name })

    def insFromVecElemInstX(name, Name, opClass, types, rCount):
        # Generate an instruction that copies element imm2 of the source
        # (AA64FpOp1, read in full as a FullRegVect) into element imm1 of
        # the destination.  The first rCount destination pieces are loaded
        # beforehand and written back afterwards; no zeroing of further
        # pieces is emitted here.
        global header_output, exec_output
        srcLoads = "".join('''
        srcReg1.regs[%(reg)d] = htole(AA64FpOp1P%(reg)d_uw);
        ''' % { "reg" : piece } for piece in range(4))
        destLoads = "".join('''
        destReg.regs[%(reg)d] = htole(AA64FpDestP%(reg)d_uw);
        ''' % { "reg" : piece } for piece in range(rCount))
        destStores = "".join('''
        AA64FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
        ''' % { "reg" : piece } for piece in range(rCount))
        eWalkCode = simd64EnabledCheckCode + '''
        FullRegVect srcReg1;
        RegVect destReg;
        ''' + srcLoads + destLoads + '''
        Element srcElem1 = letoh(srcReg1.elements[imm2]);
        Element destElem = srcElem1;
        destReg.elements[imm1] = htole(destElem);
        ''' + destStores
        iop = ArmInstObjParams(name, Name, "DataX1Reg2ImmOp",
                               { "code": eWalkCode,
                                 "r_count": rCount,
                                 "op_class": opClass }, [])
        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
        header_output += NeonX1Reg2ImmOpDeclare.subst(iop)
        exec_output += NeonXEqualRegOpExecute.subst(iop)
        for targs in types:
            exec_output += NeonXExecDeclare.subst({ "targs" : targs,
                                                    "class_name" : Name })

    def twoRegPairwiseScInstX(name, Name, opClass, types, rCount, op):
        # Generate a scalar pairwise instruction: op combines elements 0 and
        # 1 of the source (AA64FpOp1) into destElem, which becomes element 0
        # of the destination.
        #
        #   name, Name -- forwarded to ArmInstObjParams; Name is also the
        #                 class name of the per-type instantiations
        #   opClass    -- stored as the "op_class" snippet
        #   types      -- one NeonXExecDeclare instantiation per entry
        #   rCount     -- number of source pieces loaded; the result takes
        #                 rCount // 2 pieces, or 1 piece when rCount is 1
        #   op         -- C++ fragment inserted after srcElem1, srcElem2 and
        #                 destElem are declared
        global header_output, exec_output
        if rCount == 1:
            # This is the FP16 case
            destCnt = rCount
        else:
            destCnt = rCount // 2

        eWalkCode = simd64EnabledCheckCode + '''
        RegVect srcReg1, destReg;
        '''
        for reg in range(rCount):
            eWalkCode += '''
        srcReg1.regs[%(reg)d] = htole(AA64FpOp1P%(reg)d_uw);
        ''' % { "reg" : reg }
        # destReg is declared without an initializer and only elements[0]
        # is assigned below.  Clear every piece that is written back so
        # that no indeterminate bits reach the destination when a single
        # element does not cover them (e.g. rCount == 1), in the same way
        # twoRegAcrossInstX clears destReg.regs[0] before its store.
        for reg in range(destCnt):
            eWalkCode += '''
        destReg.regs[%(reg)d] = 0;
        ''' % { "reg" : reg }
        eWalkCode += '''
        Element srcElem1 = letoh(srcReg1.elements[0]);
        Element srcElem2 = letoh(srcReg1.elements[1]);
        Element destElem;
        %(op)s
        destReg.elements[0] = htole(destElem);
        ''' % { "op" : op }

        for reg in range(destCnt):
            eWalkCode += '''
        AA64FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
        ''' % { "reg" : reg }
        for reg in range(destCnt, 4):  # zero upper half
            eWalkCode += '''
        AA64FpDestP%(reg)d_uw = 0;
        ''' % { "reg" : reg }
        iop = ArmInstObjParams(name, Name, "DataX1RegOp",
                               { "code": eWalkCode,
                                 "r_count": rCount,
                                 "op_class": opClass }, [])
        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
        header_output += NeonX1RegOpDeclare.subst(iop)
        exec_output += NeonXEqualRegOpExecute.subst(iop)
        for targs in types:
            substDict = { "targs" : targs,
                          "class_name" : Name }
            exec_output += NeonXExecDeclare.subst(substDict)

    def twoRegAcrossInstX(name, Name, opClass, types, rCount, op,
                          doubleDest=False, long=False, recursive=False):
        # Generate an instruction that folds all elements of the source
        # (AA64FpOp1) into element 0 of the destination.
        #
        #   rCount     -- number of source pieces loaded
        #   op         -- C++ fragment that combines srcElem1 into destElem
        #   doubleDest -- write back 2 destination pieces instead of 1; the
        #                 remaining pieces up to 3 are cleared
        #   long       -- the destination uses the Big* vector/element type
        #                 and the unequal-register execute template
        #   recursive  -- combine elements pairwise with a doubling gap
        #                 instead of accumulating them left to right
        # name, Name, opClass and types are used as in the other helpers.
        global header_output, exec_output
        destPrefix = "Big" if long else ""
        fields = { "op" : op, "destPrefix" : destPrefix }

        eWalkCode = simd64EnabledCheckCode + '''
        RegVect srcReg1;
        %sRegVect destReg;
        ''' % destPrefix
        eWalkCode += "".join('''
        srcReg1.regs[%(reg)d] = htole(AA64FpOp1P%(reg)d_uw);
        ''' % { "reg" : piece } for piece in range(rCount))

        if recursive:
            reduction = '''
        RegVect tmpReg = srcReg1;
        destReg.regs[0] = 0;
        for (unsigned gap = 1; gap < eCount; gap = gap * 2) {
          for (unsigned i = 0; i < eCount; i = i + gap * 2) {
            unsigned src_id0 = i;
            unsigned src_id1 = i + gap;
            unsigned dst_id = i;
            %(destPrefix)sElement destElem = letoh(tmpReg.elements[src_id0]);
            %(destPrefix)sElement srcElem1 = letoh(tmpReg.elements[src_id1]);
            %(op)s
            tmpReg.elements[dst_id] = destElem;
          }
        }
        destReg.elements[0] = htole(tmpReg.elements[0]);
        '''
        else:
            reduction = '''
        destReg.regs[0] = 0;
        %(destPrefix)sElement destElem = 0;
        for (unsigned i = 0; i < eCount; i++) {
            Element srcElem1 = letoh(srcReg1.elements[i]);
            if (i == 0) {
                destElem = srcElem1;
            } else {
                %(op)s
            }
        }
        destReg.elements[0] = htole(destElem);
        '''
        eWalkCode += reduction % fields

        destCnt = 2 if doubleDest else 1
        eWalkCode += "".join('''
        AA64FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
        ''' % { "reg" : piece } for piece in range(destCnt))
        # zero upper half
        eWalkCode += "".join('''
        AA64FpDestP%(reg)d_uw = 0;
        ''' % { "reg" : piece } for piece in range(destCnt, 4))

        iop = ArmInstObjParams(name, Name, "DataX1RegOp",
                               { "code": eWalkCode,
                                 "r_count": rCount,
                                 "op_class": opClass }, [])
        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
        header_output += NeonX1RegOpDeclare.subst(iop)
        execTmpl = NeonXUnequalRegOpExecute if long else NeonXEqualRegOpExecute
        exec_output += execTmpl.subst(iop)
        for targs in types:
            exec_output += NeonXExecDeclare.subst({ "targs" : targs,
                                                    "class_name" : Name })

    def twoRegCondenseInstX(name, Name, opClass, types, rCount, op,
                            readDest=False):
        # Generate an instruction that combines each adjacent pair of source
        # elements (2*i and 2*i+1 of AA64FpOp1) into one BigElement of the
        # destination, for i in [0, eCount / 2).
        #
        #   rCount   -- number of pieces loaded and written back; destination
        #               pieces rCount..3 are cleared
        #   op       -- C++ fragment inserted after srcElem1, srcElem2 and
        #               the BigElement destElem are declared
        #   readDest -- load the previous destination contents
        # name, Name, opClass and types are used as in the other helpers.
        global header_output, exec_output
        srcLoadTmpl = '''
        srcRegs.regs[%(reg)d] = htole(AA64FpOp1P%(reg)d_uw);
        '''
        destLoadTmpl = '''
        destReg.regs[%(reg)d] = htole(AA64FpDestP%(reg)d_uw);
        '''
        eWalkCode = simd64EnabledCheckCode + '''
        RegVect srcRegs;
        BigRegVect destReg = {};
        '''
        for piece in range(rCount):
            eWalkCode += srcLoadTmpl % { "reg" : piece }
            if readDest:
                eWalkCode += destLoadTmpl % { "reg" : piece }
        readDestCode = ('destElem = letoh(destReg.elements[i]);'
                        if readDest else '')
        eWalkCode += '''
        for (unsigned i = 0; i < eCount / 2; i++) {
            Element srcElem1 = letoh(srcRegs.elements[2 * i]);
            Element srcElem2 = letoh(srcRegs.elements[2 * i + 1]);
            BigElement destElem;
            %(readDest)s
            %(op)s
            destReg.elements[i] = htole(destElem);
        }
        ''' % { "op" : op, "readDest" : readDestCode }
        eWalkCode += "".join('''
        AA64FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
        ''' % { "reg" : piece } for piece in range(rCount))
        # zero upper half (empty when rCount is 4)
        eWalkCode += "".join('''
        AA64FpDestP%(reg)d_uw = 0;
        ''' % { "reg" : piece } for piece in range(rCount, 4))
        iop = ArmInstObjParams(name, Name, "DataX1RegOp",
                               { "code": eWalkCode,
                                 "r_count": rCount,
                                 "op_class": opClass }, [])
        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
        header_output += NeonX1RegOpDeclare.subst(iop)
        exec_output += NeonXUnequalRegOpExecute.subst(iop)
        for targs in types:
            exec_output += NeonXExecDeclare.subst({ "targs" : targs,
                                                    "class_name" : Name })

    def oneRegImmInstX(name, Name, opClass, types, rCount, op, readDest=False):
        # Generate an instruction with no source vector: op is run once per
        # destination element after destElem is declared.
        #
        #   rCount   -- number of pieces written back; destination pieces
        #               rCount..3 are cleared
        #   readDest -- load the previous destination contents first
        # name, Name, opClass and types are used as in the other helpers.
        global header_output, exec_output
        eWalkCode = simd64EnabledCheckCode + '''
        RegVect destReg;
        '''
        readDestCode = ''
        if readDest:
            eWalkCode += "".join('''
        destReg.regs[%(reg)d] = htole(AA64FpDestP%(reg)d_uw);
        ''' % { "reg" : piece } for piece in range(rCount))
            readDestCode = 'destElem = letoh(destReg.elements[i]);'
        eWalkCode += '''
        for (unsigned i = 0; i < eCount; i++) {
            Element destElem;
            %(readDest)s
            %(op)s
            destReg.elements[i] = htole(destElem);
        }
        ''' % { "op" : op, "readDest" : readDestCode }
        eWalkCode += "".join('''
        AA64FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
        ''' % { "reg" : piece } for piece in range(rCount))
        # zero upper half (empty when rCount is 4)
        eWalkCode += "".join('''
        AA64FpDestP%(reg)d_uw = 0;
        ''' % { "reg" : piece } for piece in range(rCount, 4))
        iop = ArmInstObjParams(name, Name, "DataXImmOnlyOp",
                               { "code": eWalkCode,
                                 "r_count": rCount,
                                 "op_class": opClass }, [])
        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
        header_output += NeonX1RegImmOnlyOpDeclare.subst(iop)
        exec_output += NeonXEqualRegOpExecute.subst(iop)
        for targs in types:
            exec_output += NeonXExecDeclare.subst({ "targs" : targs,
                                                    "class_name" : Name })

    def dupGprInstX(name, Name, opClass, types, rCount, gprSpec):
        # Generate an instruction that stores "(Element) <gprSpec>Op1" into
        # every element of the destination.  rCount pieces are written back
        # and destination pieces rCount..3 are cleared.
        # name, Name, opClass and types are used as in the other helpers.
        global header_output, exec_output
        fillCode = '''
        RegVect destReg;
        for (unsigned i = 0; i < eCount; i++) {
            destReg.elements[i] = htole((Element) %sOp1);
        }
        ''' % gprSpec
        stores = "".join('''
        AA64FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
        ''' % { "reg" : piece } for piece in range(rCount))
        # zero upper half (empty when rCount is 4)
        clears = "".join('''
        AA64FpDestP%(reg)d_uw = 0;
        ''' % { "reg" : piece } for piece in range(rCount, 4))
        eWalkCode = simd64EnabledCheckCode + fillCode + stores + clears
        iop = ArmInstObjParams(name, Name, "DataX1RegOp",
                               { "code": eWalkCode,
                                 "r_count": rCount,
                                 "op_class": opClass }, [])
        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
        header_output += NeonX1RegOpDeclare.subst(iop)
        exec_output += NeonXEqualRegOpExecute.subst(iop)
        for targs in types:
            exec_output += NeonXExecDeclare.subst({ "targs" : targs,
                                                    "class_name" : Name })

    def extInstX(name, Name, opClass, types, rCount, op):
        # Emit a two-source instruction with an immediate whose body is the
        # caller-supplied C++ snippet `op`.  The snippet runs with srcReg1,
        # srcReg2 and destReg in scope; destReg is written back afterwards.
        #   name, Name : mnemonic and generated class name
        #   opClass    : value stored under "op_class" in the snippets
        #   types      : element types to emit NeonXExecDeclare entries for
        #   rCount     : number of register parts loaded / stored
        #   op         : C++ code inserted between the loads and the stores
        global header_output, exec_output
        loadSources = '''
        srcReg1.regs[%(reg)d] = htole(AA64FpOp1P%(reg)d_uw);
        srcReg2.regs[%(reg)d] = htole(AA64FpOp2P%(reg)d_uw);
        '''
        writeBack = '''
        AA64FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
        '''
        clearPart = '''
        AA64FpDestP%(reg)d_uw = 0;
        '''
        eWalkCode = simd64EnabledCheckCode + '''
        RegVect srcReg1, srcReg2, destReg;
        '''
        eWalkCode += "".join(loadSources % {"reg": idx}
                             for idx in range(rCount))
        eWalkCode += op
        eWalkCode += "".join(writeBack % {"reg": idx}
                             for idx in range(rCount))
        # Parts rCount..3 are zeroed; the range is empty once rCount >= 4.
        eWalkCode += "".join(clearPart % {"reg": idx}
                             for idx in range(rCount, 4))
        snippets = {"code": eWalkCode,
                    "r_count": rCount,
                    "op_class": opClass}
        iop = ArmInstObjParams(name, Name, "DataX2RegImmOp", snippets, [])
        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
        header_output += NeonX2RegImmOpDeclare.subst(iop)
        exec_output += NeonXEqualRegOpExecute.subst(iop)
        for elemType in types:
            exec_output += NeonXExecDeclare.subst(
                {"targs": elemType, "class_name": Name})

    def insFromGprInstX(name, Name, opClass, types, rCount, gprSpec):
        # Emit an instruction that overwrites destination element [imm] with
        # the general-purpose source operand (<gprSpec>Op1).  The destination
        # is read first so that the other elements are written back
        # unchanged.
        #   name, Name : mnemonic and generated class name
        #   opClass    : value stored under "op_class" in the snippets
        #   types      : element types to emit NeonXExecDeclare entries for
        #   rCount     : number of destination parts read and written back
        #   gprSpec    : prefix of the source operand name
        global header_output, exec_output
        loadDest = '''
        destReg.regs[%(reg)d] = htole(AA64FpDestP%(reg)d_uw);
        '''
        storeDest = '''
        AA64FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
        '''
        partArgs = [{"reg": idx} for idx in range(rCount)]
        eWalkCode = simd64EnabledCheckCode + '''
        RegVect destReg;
        '''
        eWalkCode += "".join(loadDest % arg for arg in partArgs)
        eWalkCode += '''
        destReg.elements[imm] = htole((Element) %sOp1);
        ''' % gprSpec
        eWalkCode += "".join(storeDest % arg for arg in partArgs)
        snippets = {"code": eWalkCode,
                    "r_count": rCount,
                    "op_class": opClass}
        iop = ArmInstObjParams(name, Name, "DataX1RegImmOp", snippets, [])
        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
        header_output += NeonX1RegImmOpDeclare.subst(iop)
        exec_output += NeonXEqualRegOpExecute.subst(iop)
        for elemType in types:
            exec_output += NeonXExecDeclare.subst(
                {"targs": elemType, "class_name": Name})

    def insToGprInstX(name, Name, opClass, types, rCount, gprSpec,
                      signExt=False):
        # Emit an instruction that copies source vector element [imm] into
        # the general-purpose destination operand (<gprSpec>Dest).
        #   name, Name : mnemonic and generated class name
        #   opClass    : value stored under "op_class" in the snippets
        #   types      : element types to emit NeonXExecDeclare entries for
        #   rCount     : stored under "r_count"; all four source parts are
        #                loaded regardless of its value
        #   gprSpec    : prefix of the destination operand name
        #   signExt    : when true the element is passed through
        #                sext<sizeof(Element) * 8>() before being stored
        global header_output, exec_output
        loadSource = '''
        srcReg.regs[%(reg)d] = htole(AA64FpOp1P%(reg)d_uw);
        '''
        if signExt:
            extract = '''
        %sDest = sext<sizeof(Element) * 8>(srcReg.elements[imm]);
        '''
        else:
            extract = '''
        %sDest = srcReg.elements[imm];
        '''
        eWalkCode = simd64EnabledCheckCode + '''
        FullRegVect srcReg;
        '''
        eWalkCode += "".join(loadSource % {"reg": idx} for idx in range(4))
        eWalkCode += extract % gprSpec
        snippets = {"code": eWalkCode,
                    "r_count": rCount,
                    "op_class": opClass}
        iop = ArmInstObjParams(name, Name, "DataX1RegImmOp", snippets, [])
        header_output += NeonX1RegImmOpDeclare.subst(iop)
        exec_output += NeonXEqualRegOpExecute.subst(iop)
        for elemType in types:
            exec_output += NeonXExecDeclare.subst(
                {"targs": elemType, "class_name": Name})

    def tbxTblInstX(name, Name, opClass, types, length, isTbl, rCount):
        # Emit a byte table-lookup instruction.  Each byte of the second
        # source is used as an index into a 64-byte table built from
        # `length` table registers; out-of-range indices produce 0 when
        # isTbl is true and leave the destination byte unchanged otherwise.
        #   name, Name : mnemonic and generated class name
        #   opClass    : value stored under "op_class" in the snippets
        #   types      : element types to emit NeonXExecDeclare entries for
        #   length     : number of table registers; four 32-bit words are
        #                loaded per register, the rest of the table is zeroed
        #   isTbl      : substituted with %s into `const bool isTbl = ...;`,
        #                so it must render as a valid C++ bool expression
        #   rCount     : number of 32-bit parts of destReg / srcReg2
        global header_output, exec_output
        code = simd64EnabledCheckCode + '''
        union
        {
            uint8_t bytes[64];
            uint32_t regs[16];
        } table;

        union
        {
            uint8_t bytes[%(rCount)d * 4];
            uint32_t regs[%(rCount)d];
        } destReg, srcReg2;

        const unsigned length = %(length)d;
        const bool isTbl = %(isTbl)s;
        ''' % { "rCount" : rCount, "length" : length, "isTbl" : isTbl }
        for reg in range(rCount):
            code += '''
        srcReg2.regs[%(reg)d] = htole(AA64FpOp2P%(reg)d_uw);
        destReg.regs[%(reg)d] = htole(AA64FpDestP%(reg)d_uw);
        ''' % { "reg" : reg }
        for reg in range(16):
            if reg < length * 4:
                # Table word `reg` is part (reg % 4) of table register
                # (reg // 4).  Use integer division explicitly: `/` yields
                # a float in Python 3 and only worked because %d truncates.
                vecIdx, partIdx = divmod(reg, 4)
                code += '''
        table.regs[%(reg)d] = htole(AA64FpOp1P%(p)dV%(v)dS_uw);
        ''' % { "reg" : reg, "p" : partIdx, "v" : vecIdx }
            else:
                code += '''
        table.regs[%(reg)d] = 0;
        ''' % { "reg" : reg }
        code += '''
        for (unsigned i = 0; i < sizeof(destReg); i++) {
            uint8_t index = srcReg2.bytes[i];
            if (index < 16 * length) {
                destReg.bytes[i] = table.bytes[index];
            } else {
                if (isTbl)
                    destReg.bytes[i] = 0;
                // else destReg.bytes[i] unchanged
            }
        }
        '''
        for reg in range(rCount):
            code += '''
        AA64FpDestP%(reg)d_uw = letoh(destReg.regs[%(reg)d]);
        ''' % { "reg" : reg }
        if rCount < 4:  # zero upper half
            for reg in range(rCount, 4):
                code += '''
        AA64FpDestP%(reg)d_uw = 0;
        ''' % { "reg" : reg }
        iop = ArmInstObjParams(name, Name, "DataX2RegOp",
                               { "code": code,
                                 "r_count": rCount,
                                 "op_class": opClass }, [])
        iop.snippets["code"] += zeroSveVecRegUpperPartCode % "AA64FpDest"
        header_output += NeonX2RegOpDeclare.subst(iop)
        exec_output += NeonXEqualRegOpExecute.subst(iop)
        for elemType in types:
            substDict = { "targs" : elemType,
                          "class_name" : Name }
            exec_output += NeonXExecDeclare.subst(substDict)

    # ABS
    # destElem is -srcElem1 when srcElem1 is negative, otherwise srcElem1.
    absCode = '''
            if (srcElem1 < 0) {
                destElem = -srcElem1;
            } else {
                destElem = srcElem1;
            }
    '''
    twoEqualRegInstX("abs", "AbsDX", "SimdAluOp", signedTypes, 2, absCode)
    twoEqualRegInstX("abs", "AbsQX", "SimdAluOp", signedTypes, 4, absCode)
    # ADD
    # Element-wise sum.  addCode is reused by the ADDP registrations below.
    addCode = "destElem = srcElem1 + srcElem2;"
    threeEqualRegInstX("add", "AddDX", "SimdAddOp", unsignedTypes, 2, addCode)
    threeEqualRegInstX("add", "AddQX", "SimdAddOp", unsignedTypes, 4, addCode)
    # ADDHN, ADDHN2
    # The sum is formed in BigElement and shifted right by the bit width of
    # Element, so destElem receives the bits above the low Element-sized
    # part of the sum.
    addhnCode = '''
            destElem = ((BigElement)srcElem1 + (BigElement)srcElem2) >>
                        (sizeof(Element) * 8);
    '''
    threeRegNarrowInstX("addhn", "AddhnX", "SimdAddOp", smallUnsignedTypes,
                        addhnCode)
    threeRegNarrowInstX("addhn2", "Addhn2X", "SimdAddOp", smallUnsignedTypes,
                        addhnCode, hi=True)
    # ADDP (scalar)
    twoRegPairwiseScInstX("addp", "AddpScQX", "SimdAddOp", ("uint64_t",), 4,
                          addCode)
    # ADDP (vector)
    threeEqualRegInstX("addp", "AddpDX", "SimdAddOp", smallUnsignedTypes, 2,
                       addCode, pairwise=True)
    threeEqualRegInstX("addp", "AddpQX", "SimdAddOp", unsignedTypes, 4,
                       addCode, pairwise=True)
    # ADDV
    # Note: SimdAddOp can be a bit optimistic here
    # The snippet accumulates each source element into destElem.
    # NOTE(review): the meaning of the trailing (False, False, True)
    # positional flags is defined by twoRegAcrossInstX, which is not visible
    # here -- confirm before changing them.
    addAcrossCode = "destElem += srcElem1;"
    twoRegAcrossInstX("addv", "AddvDX", "SimdAddOp", ("uint8_t", "uint16_t"),
                      2, addAcrossCode, False, False, True)
    twoRegAcrossInstX("addv", "AddvQX", "SimdAddOp", smallUnsignedTypes, 4,
                      addAcrossCode, False, False, True)
    # AND
    # Bitwise operations in this group are registered with uint64_t only.
    andCode = "destElem = srcElem1 & srcElem2;"
    threeEqualRegInstX("and", "AndDX", "SimdAluOp", ("uint64_t",), 2, andCode)
    threeEqualRegInstX("and", "AndQX", "SimdAluOp", ("uint64_t",), 4, andCode)
    # BIC (immediate)
    # Clears the bits of destElem that are set in imm.
    bicImmCode = "destElem &= ~imm;"
    oneRegImmInstX("bic", "BicImmDX", "SimdAluOp", ("uint64_t",), 2,
                   bicImmCode, True)
    oneRegImmInstX("bic", "BicImmQX", "SimdAluOp", ("uint64_t",), 4,
                   bicImmCode, True)
    # BIC (register)
    bicCode = "destElem = srcElem1 & ~srcElem2;"
    threeEqualRegInstX("bic", "BicDX", "SimdAluOp", ("uint64_t",), 2, bicCode)
    threeEqualRegInstX("bic", "BicQX", "SimdAluOp", ("uint64_t",), 4, bicCode)
    # BIF
    # Keeps destElem bits where srcElem2 is 1 and takes srcElem1 bits where
    # srcElem2 is 0.  The snippet reads destElem; the trailing True is
    # presumably the readDest flag of threeEqualRegInstX -- confirm.
    bifCode = "destElem = (destElem & srcElem2) | (srcElem1 & ~srcElem2);"
    threeEqualRegInstX("bif", "BifDX", "SimdAluOp", ("uint64_t",), 2, bifCode,
                       True)
    threeEqualRegInstX("bif", "BifQX", "SimdAluOp", ("uint64_t",), 4, bifCode,
                       True)
    # BIT
    # Takes srcElem1 bits where srcElem2 is 1 and keeps destElem bits where
    # srcElem2 is 0.
    bitCode = "destElem = (srcElem1 & srcElem2) | (destElem & ~srcElem2);"
    threeEqualRegInstX("bit", "BitDX", "SimdAluOp", ("uint64_t",), 2, bitCode,
                       True)
    threeEqualRegInstX("bit", "BitQX", "SimdAluOp", ("uint64_t",), 4, bitCode,
                       True)
    # BSL
    # The incoming destElem is the selector: srcElem1 bits where it is 1,
    # srcElem2 bits where it is 0.
    bslCode = "destElem = (srcElem1 & destElem) | (srcElem2 & ~destElem);"
    threeEqualRegInstX("bsl", "BslDX", "SimdAluOp", ("uint64_t",), 2, bslCode,
                       True)
    threeEqualRegInstX("bsl", "BslQX", "SimdAluOp", ("uint64_t",), 4, bslCode,
                       True)

    # FCADD
    # Works on pairs of adjacent elements (2*i, 2*i+1).  Bit 12 of machInst
    # selects which member of the srcReg2 pair is negated before the two
    # fplibAdd calls: with rot set, the second member is added to the first
    # srcReg1 element and the negated first member to the second; with rot
    # clear the negation is applied to the other member instead.
    # The snippet walks the registers itself, hence complex=True.
    fcaddCode = '''
        bool rot = bits(machInst, 12);
        Element el1;
        Element el3;
        for (int i = 0; i < eCount/2; ++i) {
            FPSCR fpscr = (FPSCR) FpscrExc;

            Element srcElem1_1 = letoh(srcReg1.elements[2*i]);
            Element srcElem1_2 = letoh(srcReg1.elements[2*i+1]);
            Element srcElem2_1 = letoh(srcReg2.elements[2*i]);
            Element srcElem2_2 = letoh(srcReg2.elements[2*i+1]);
            Element destElem_1;
            Element destElem_2;
            if (rot) {
                el1 = srcElem2_2;
                el3 = fplibNeg<Element>(srcElem2_1);
            } else {
                el1 = fplibNeg<Element>(srcElem2_2);
                el3 = srcElem2_1;
            }

            destElem_1 = fplibAdd<Element>(srcElem1_1, el1, fpscr);
            destElem_2 = fplibAdd<Element>(srcElem1_2, el3, fpscr);

            FpscrExc = fpscr;

            destReg.elements[2*i] = htole(destElem_1);
            destReg.elements[2*i+1] = htole(destElem_2);
         }
         '''

    # The 2-register form is registered for uint16_t/uint32_t only; the
    # 4-register form uses floatTypes.
    threeEqualRegInstX("fcadd", "FcaddDX", "SimdFloatAddOp",
                            ("uint16_t", "uint32_t"), 2,
                            fcaddCode, complex=True)
    threeEqualRegInstX("fcadd", "FcaddQX", "SimdFloatAddOp", floatTypes, 4,
                       fcaddCode, complex=True)

    # FCMLA
    # Shared C++ template for both FCMLA forms.  Two placeholders are filled
    # in below:
    #   %(rot)s   - machInst bit range holding the 2-bit rotation selector
    #   %(index)s - which srcReg2 element pair to use ('imm' for the
    #               by-element form, the loop index 'i' for the vector form)
    # For each element pair the rotation picks which srcReg1/srcReg2 members
    # (possibly negated) are multiplied, and both products are accumulated
    # into the existing destination pair with fplibMulAdd.
    fcmlaCode = '''
        uint8_t rot = bits(machInst, %(rot)s);
        Element el1;
        Element el2;
        Element el3;
        Element el4;
        for (int i = 0; i < eCount/2; ++i) {
            FPSCR fpscr = (FPSCR) FpscrExc;

            Element srcElem1_1 = letoh(srcReg1.elements[2*i]);
            Element srcElem1_2 = letoh(srcReg1.elements[2*i+1]);
            Element srcElem2_1 = letoh(srcReg2.elements[2* %(index)s]);
            Element srcElem2_2 = letoh(srcReg2.elements[2* %(index)s +1]);
            Element destElem_1 = letoh(destReg.elements[2*i]);
            Element destElem_2 = letoh(destReg.elements[2*i+1]);

            switch (rot) {
              case 0x0:
                {
                  el1 = srcElem2_1;
                  el2 = srcElem1_1;
                  el3 = srcElem2_2;
                  el4 = srcElem1_1;
                  break;
                }
              case 0x1:
                {
                  el1 = fplibNeg<Element>(srcElem2_2);
                  el2 = srcElem1_2;
                  el3 = srcElem2_1;
                  el4 = srcElem1_2;
                  break;
                }
              case 0x2:
                {
                  el1 = fplibNeg<Element>(srcElem2_1);
                  el2 = srcElem1_1;
                  el3 = fplibNeg<Element>(srcElem2_2);
                  el4 = srcElem1_1;
                  break;
                }
              case 0x3:
                {
                  el1 = srcElem2_2;
                  el2 = srcElem1_2;
                  el3 = fplibNeg<Element>(srcElem2_1);
                  el4 = srcElem1_2;
                  break;
                }
            }
            destElem_1 = fplibMulAdd<Element>(destElem_1, el2, el1, fpscr);
            destElem_2 = fplibMulAdd<Element>(destElem_2, el4, el3, fpscr);

            FpscrExc = fpscr;

            destReg.elements[2*i] = htole(destElem_1);
            destReg.elements[2*i+1] = htole(destElem_2);
         }
         '''
    # FCMLA (by element)
    # Rotation comes from machInst bits 14:13; the srcReg2 pair is chosen
    # by imm.
    fcmla_imm = fcmlaCode % {'rot': '14, 13', 'index': 'imm'}
    threeEqualRegInstX("fcmla", "FcmlaElemDX", "SimdFloatMultAccOp",
                        ("uint16_t", "uint32_t"), 2, fcmla_imm, True,
                        byElem=True, complex=True)
    threeEqualRegInstX("fcmla", "FcmlaElemQX", "SimdFloatMultAccOp",
                       floatTypes, 4, fcmla_imm, True, byElem=True,
                       complex=True)
    # FCMLA (vector)
    # Rotation comes from machInst bits 12:11; the srcReg2 pair follows the
    # loop index.
    fcmla_vec = fcmlaCode % {'rot': '12, 11', 'index': 'i'}
    threeEqualRegInstX("fcmla", "FcmlaDX", "SimdFloatMultAccOp",
                       ("uint16_t", "uint32_t"), 2, fcmla_vec, True,
                       complex=True)
    threeEqualRegInstX("fcmla", "FcmlaQX", "SimdFloatMultAccOp",
                       floatTypes, 4, fcmla_vec, True, complex=True)

    def intDotInst(name, Name, opClass,
                   destIsSigned, src1IsSigned, src2IsSigned,
                   rCount, byElem):
        # Register an integer dot-product instruction through
        # threeEqualRegInstX.  Each 32-bit destination element accumulates
        # the four products of the 8-bit lanes packed in the matching source
        # elements.
        #   name, Name, opClass : forwarded to threeEqualRegInstX
        #   destIsSigned        : accumulator is int32_t (else uint32_t)
        #   src1IsSigned        : first source lanes are int8_t (else uint8_t)
        #   src2IsSigned        : second source lanes are int8_t (else
        #                         uint8_t)
        #   rCount              : forwarded to threeEqualRegInstX
        #   byElem              : second source element is selected by imm
        #                         instead of the loop index
        typeSubst = {
            "src1Type": "int8_t" if src1IsSigned else "uint8_t",
            "src2Type": "int8_t" if src2IsSigned else "uint8_t",
            "src2Index": "imm" if byElem else "i",
        }
        accumTypes = ("int32_t",) if destIsSigned else ("uint32_t",)
        dotTemplate = '''
        using Src1Element = %(src1Type)s;
        using Src2Element = %(src2Type)s;

        // Neon dot instructions always generate one output element
        // from 4 pairs of source elements.
        static_assert(sizeof(Element) == 4 * sizeof(Src1Element));
        static_assert(sizeof(Element) == 4 * sizeof(Src2Element));

        // Extended source element types to avoid overflow of intermediate
        // calculations.
        using ExtendedSrc1Element =
                typename vector_element_traits::
                    extend_element<Element, Src1Element>::type;
        using ExtendedSrc2Element =
                typename vector_element_traits::
                    extend_element<Element, Src2Element>::type;

        for (unsigned i = 0; i < eCount; ++i) {
            Element src1ElemsPacked = letoh(srcReg1.elements[i]);
            Element src2ElemsPacked = letoh(srcReg2.elements[%(src2Index)s]);

            Src1Element *src1Elems =
                reinterpret_cast<Src1Element*>(&src1ElemsPacked);
            Src2Element *src2Elems =
                reinterpret_cast<Src2Element*>(&src2ElemsPacked);

            // Dot instructions accumulate into the dest reg
            Element destElem = letoh(destReg.elements[i]);

            for (unsigned j = 0; j < 4; ++j) {
                ExtendedSrc1Element src1Elem =
                    static_cast<ExtendedSrc1Element>(src1Elems[j]);
                ExtendedSrc2Element src2Elem =
                    static_cast<ExtendedSrc2Element>(src2Elems[j]);
                destElem += src1Elem * src2Elem;
            }
            destReg.elements[i] = htole(destElem);
        }
        '''
        threeEqualRegInstX(name, Name, opClass, accumTypes, rCount,
                           dotTemplate % typeSubst, readDest=True,
                           byElem=byElem, complex=True)

    # Integer dot-product registrations.  Each row is the positional
    # argument list of intDotInst:
    #   (name, Name, opClass, destIsSigned, src1IsSigned, src2IsSigned,
    #    rCount, byElem)
    # Rows are registered top to bottom.
    for _dotArgs in (
            # SDOT (vector)
            ('sdot', 'SdotDX', 'SimdAluOp', True, True, True, 2, False),
            ('sdot', 'SdotQX', 'SimdAluOp', True, True, True, 4, False),
            # SDOT (element)
            ('sdot', 'SdotElemDX', 'SimdAluOp', True, True, True, 2, True),
            ('sdot', 'SdotElemQX', 'SimdAluOp', True, True, True, 4, True),
            # UDOT (vector)
            ('udot', 'UdotDX', 'SimdAluOp', False, False, False, 2, False),
            ('udot', 'UdotQX', 'SimdAluOp', False, False, False, 4, False),
            # UDOT (element)
            ('udot', 'UdotElemDX', 'SimdAluOp', False, False, False, 2, True),
            ('udot', 'UdotElemQX', 'SimdAluOp', False, False, False, 4, True),
            # SUDOT (element)
            ('sudot', 'SudotElemDX', 'SimdAluOp', True, True, False, 2, True),
            ('sudot', 'SudotElemQX', 'SimdAluOp', True, True, False, 4, True),
            # USDOT (vector)
            ('usdot', 'UsdotDX', 'SimdAluOp', True, False, True, 2, False),
            ('usdot', 'UsdotQX', 'SimdAluOp', True, False, True, 4, False),
            # USDOT (element)
            ('usdot', 'UsdotElemDX', 'SimdAluOp', True, False, True, 2, True),
            ('usdot', 'UsdotElemQX', 'SimdAluOp', True, False, True, 4, True),
    ):
        intDotInst(*_dotArgs)
    # Do not leave the loop variable behind in the enclosing namespace.
    del _dotArgs

    def intMatMulInst(name, Name, opClass,
                      destIsSigned, src1IsSigned, src2IsSigned):
        # Register an integer matrix multiply-accumulate instruction through
        # threeEqualRegInstX (always with a register count of 4).  The
        # generated code treats the destination as a 2x2 matrix of 32-bit
        # elements and accumulates into it the products of 8-element rows
        # taken from the two sources.
        #   name, Name, opClass : forwarded to threeEqualRegInstX
        #   destIsSigned        : accumulator is int32_t (else uint32_t)
        #   src1IsSigned        : first source lanes are int8_t (else uint8_t)
        #   src2IsSigned        : second source lanes are int8_t (else
        #                         uint8_t)
        typeSubst = {
            "src1Type": "int8_t" if src1IsSigned else "uint8_t",
            "src2Type": "int8_t" if src2IsSigned else "uint8_t",
        }
        accumTypes = ("int32_t",) if destIsSigned else ("uint32_t",)
        matMulTemplate = '''
        using Src1Element = %(src1Type)s;
        using Src2Element = %(src2Type)s;

        // Neon MM instructions always generate four output elements
        // from 16 pairs of source elements.
        static_assert(sizeof(Element) == 4 * sizeof(Src1Element));
        static_assert(sizeof(Element) == 4 * sizeof(Src2Element));

        // Extended source element types to avoid overflow of intermediate
        // calculations.
        using ExtendedSrc1Element =
                typename vector_element_traits::
                    extend_element<Element, Src1Element>::type;
        using ExtendedSrc2Element =
                typename vector_element_traits::
                    extend_element<Element, Src2Element>::type;

        // Properties of the matrices
        constexpr unsigned destMatSize = 2; // Dest Matrices are dim 2x2
        constexpr unsigned K = 8;           // Src matrices are dim 2x8 & 8x2

        constexpr unsigned eltsPerMatrix = destMatSize * destMatSize;

        Element destMat[eltsPerMatrix] = {0};
        for (unsigned j = 0; j < eltsPerMatrix; ++j) {
            destMat[j] = letoh(destReg.elements[j]);
        }

        Element src1MatPacked[eltsPerMatrix] = {0};
        Element src2MatPacked[eltsPerMatrix] = {0};
        for (unsigned j = 0; j < eltsPerMatrix; ++j) {
            src1MatPacked[j] = letoh(srcReg1.elements[j]);
            src2MatPacked[j] = letoh(srcReg2.elements[j]);
        }

        Src1Element *src1Mat =
            reinterpret_cast<Src1Element*>(&src1MatPacked);
        Src2Element *src2Mat =
            reinterpret_cast<Src2Element*>(&src2MatPacked);

        unsigned destEltIdx = 0;
        for (unsigned rowIdx = 0; rowIdx < destMatSize; ++rowIdx) {
            for (unsigned colIdx = 0; colIdx < destMatSize; ++colIdx) {
                Element destElem = destMat[destEltIdx];
                for (unsigned k = 0; k < K; ++k) {
                    const ExtendedSrc1Element src1Elem =
                        static_cast<ExtendedSrc1Element>
                                            (src1Mat[K * rowIdx + k]);
                    const ExtendedSrc2Element src2Elem =
                        static_cast<ExtendedSrc2Element>
                                            (src2Mat[K * colIdx + k]);

                    destElem += src1Elem * src2Elem;
                }
                destMat[destEltIdx++] = destElem;
            }
        }

        for (unsigned j = 0; j < eltsPerMatrix; ++j) {
            destReg.elements[j] = htole(destMat[j]);
        }
        '''
        threeEqualRegInstX(name, Name, opClass, accumTypes, 4,
                           matMulTemplate % typeSubst, readDest=True,
                           byElem=False, complex=True)

    # Integer matrix multiply-accumulate registrations.  Each row is the
    # positional argument list of intMatMulInst:
    #   (name, Name, opClass, destIsSigned, src1IsSigned, src2IsSigned)
    for _matMulArgs in (
            # SMMLA
            ('smmla', 'SmmlaQX', 'SimdMatMultAccOp', True, True, True),
            # USMMLA
            ('usmmla', 'UsmmlaQX', 'SimdMatMultAccOp', True, False, True),
            # UMMLA
            ('ummla', 'UmmlaQX', 'SimdMatMultAccOp', False, False, False),
    ):
        intMatMulInst(*_matMulArgs)
    # Do not leave the loop variable behind in the enclosing namespace.
    del _matMulArgs

    # CLS
    # Shifts the element left one bit at a time and counts how many of the
    # following bits still have the same sign as the original value.  The
    # count is capped at (element width - 1).
    clsCode = '''
            unsigned count = 0;
            if (srcElem1 < 0) {
                srcElem1 <<= 1;
                while (srcElem1 < 0 && count < sizeof(Element) * 8 - 1) {
                    count++;
                    srcElem1 <<= 1;
                }
            } else {
                srcElem1 <<= 1;
                while (srcElem1 >= 0 && count < sizeof(Element) * 8 - 1) {
                    count++;
                    srcElem1 <<= 1;
                }
            }
            destElem = count;
    '''
    twoEqualRegInstX("cls", "ClsDX", "SimdAluOp", smallSignedTypes, 2, clsCode)
    twoEqualRegInstX("cls", "ClsQX", "SimdAluOp", smallSignedTypes, 4, clsCode)
    # CLZ
    # Counts left shifts until the value becomes negative (i.e. until a set
    # bit reaches the sign position), capped at the element width.  This
    # relies on the signed element types the instruction is registered with.
    clzCode = '''
            unsigned count = 0;
            while (srcElem1 >= 0 && count < sizeof(Element) * 8) {
                count++;
                srcElem1 <<= 1;
            }
            destElem = count;
    '''
    twoEqualRegInstX("clz", "ClzDX", "SimdAluOp", smallSignedTypes, 2, clzCode)
    twoEqualRegInstX("clz", "ClzQX", "SimdAluOp", smallSignedTypes, 4, clzCode)
    # Integer compares.  Every snippet in this group writes (Element)(-1)
    # to destElem when the condition holds and 0 otherwise.
    # CMEQ (register)
    cmeqCode = "destElem = (srcElem1 == srcElem2) ? (Element)(-1) : 0;"
    threeEqualRegInstX("cmeq", "CmeqDX", "SimdCmpOp", unsignedTypes, 2,
                       cmeqCode)
    threeEqualRegInstX("cmeq", "CmeqQX", "SimdCmpOp", unsignedTypes, 4,
                       cmeqCode)
    # CMEQ (zero)
    cmeqZeroCode = "destElem = (srcElem1 == 0) ? (Element)(-1) : 0;"
    twoEqualRegInstX("cmeq", "CmeqZeroDX", "SimdCmpOp", signedTypes, 2,
                     cmeqZeroCode)
    twoEqualRegInstX("cmeq", "CmeqZeroQX", "SimdCmpOp", signedTypes, 4,
                     cmeqZeroCode)
    # CMGE (register)
    # cmgeCode is reused by CMHS below with unsigned element types.
    cmgeCode = "destElem = (srcElem1 >= srcElem2) ? (Element)(-1) : 0;"
    threeEqualRegInstX("cmge", "CmgeDX", "SimdCmpOp", signedTypes, 2, cmgeCode)
    threeEqualRegInstX("cmge", "CmgeQX", "SimdCmpOp", signedTypes, 4, cmgeCode)
    # CMGE (zero)
    cmgeZeroCode = "destElem = (srcElem1 >= 0) ? (Element)(-1) : 0;"
    twoEqualRegInstX("cmge", "CmgeZeroDX", "SimdCmpOp", signedTypes, 2,
                     cmgeZeroCode)
    twoEqualRegInstX("cmge", "CmgeZeroQX", "SimdCmpOp", signedTypes, 4,
                     cmgeZeroCode)
    # CMGT (register)
    # cmgtCode is reused by CMHI below with unsigned element types.
    cmgtCode = "destElem = (srcElem1 > srcElem2) ? (Element)(-1) : 0;"
    threeEqualRegInstX("cmgt", "CmgtDX", "SimdCmpOp", signedTypes, 2, cmgtCode)
    threeEqualRegInstX("cmgt", "CmgtQX", "SimdCmpOp", signedTypes, 4, cmgtCode)
    # CMGT (zero)
    cmgtZeroCode = "destElem = (srcElem1 > 0) ? (Element)(-1) : 0;"
    twoEqualRegInstX("cmgt", "CmgtZeroDX", "SimdCmpOp", signedTypes, 2,
                     cmgtZeroCode)
    twoEqualRegInstX("cmgt", "CmgtZeroQX", "SimdCmpOp", signedTypes, 4,
                     cmgtZeroCode)
    # CMHI (register)
    # Same snippet as CMGT; only the (unsigned) element types differ.
    threeEqualRegInstX("cmhi", "CmhiDX", "SimdCmpOp", unsignedTypes, 2,
                       cmgtCode)
    threeEqualRegInstX("cmhi", "CmhiQX", "SimdCmpOp", unsignedTypes, 4,
                       cmgtCode)
    # CMHS (register)
    # Same snippet as CMGE; only the (unsigned) element types differ.
    threeEqualRegInstX("cmhs", "CmhsDX", "SimdCmpOp", unsignedTypes, 2,
                       cmgeCode)
    threeEqualRegInstX("cmhs", "CmhsQX", "SimdCmpOp", unsignedTypes, 4,
                       cmgeCode)
    # CMLE (zero)
    cmleZeroCode = "destElem = (srcElem1 <= 0) ? (Element)(-1) : 0;"
    twoEqualRegInstX("cmle", "CmleZeroDX", "SimdCmpOp", signedTypes, 2,
                     cmleZeroCode)
    twoEqualRegInstX("cmle", "CmleZeroQX", "SimdCmpOp", signedTypes, 4,
                     cmleZeroCode)
    # CMLT (zero)
    cmltZeroCode = "destElem = (srcElem1 < 0) ? (Element)(-1) : 0;"
    twoEqualRegInstX("cmlt", "CmltZeroDX", "SimdCmpOp", signedTypes, 2,
                     cmltZeroCode)
    twoEqualRegInstX("cmlt", "CmltZeroQX", "SimdCmpOp", signedTypes, 4,
                     cmltZeroCode)
    # CMTST (register)
    # True when the two sources share at least one set bit.
    tstCode = "destElem = (srcElem1 & srcElem2) ? (Element)(-1) : 0;"
    threeEqualRegInstX("cmtst", "CmtstDX", "SimdAluOp", unsignedTypes, 2,
                       tstCode)
    threeEqualRegInstX("cmtst", "CmtstQX", "SimdAluOp", unsignedTypes, 4,
                       tstCode)
    # CNT
    # Counts the set bits of the element by testing bit 0 and shifting
    # right until the value is zero.  Registered for uint8_t only.
    cntCode = '''
            unsigned count = 0;
            while (srcElem1 && count < sizeof(Element) * 8) {
                count += srcElem1 & 0x1;
                srcElem1 >>= 1;
            }
            destElem = count;
    '''
    twoEqualRegInstX("cnt", "CntDX", "SimdAluOp", ("uint8_t",), 2, cntCode)
    twoEqualRegInstX("cnt", "CntQX", "SimdAluOp", ("uint8_t",), 4, cntCode)
    # DUP (element)
    # The snippet is a plain copy; element selection and replication are
    # requested through the isDup/byElem (and scalar) flags of
    # twoEqualRegInstX.
    dupCode = "destElem = srcElem1;"
    twoEqualRegInstX("dup", "DupElemDX", "SimdMiscOp", smallUnsignedTypes, 2,
                     dupCode, isDup=True, byElem=True)
    twoEqualRegInstX("dup", "DupElemQX", "SimdMiscOp", unsignedTypes, 4,
                     dupCode, isDup=True, byElem=True)
    twoEqualRegInstX("dup", "DupElemScX", "SimdMiscOp", unsignedTypes, 4,
                     dupCode, isDup=True, byElem=True, scalar=True)
    # DUP (general register)
    # The last argument is the operand-name prefix of the source register
    # ('W' -> WOp1, 'X' -> XOp1).
    dupGprInstX("dup", "DupGprWDX", "SimdMiscOp", smallUnsignedTypes, 2, 'W')
    dupGprInstX("dup", "DupGprWQX", "SimdMiscOp", smallUnsignedTypes, 4, 'W')
    dupGprInstX("dup", "DupGprXQX", "SimdMiscOp", ("uint64_t",), 4, 'X')
    # EOR
    eorCode = "destElem = srcElem1 ^ srcElem2;"
    threeEqualRegInstX("eor", "EorDX", "SimdAluOp", ("uint64_t",), 2, eorCode)
    threeEqualRegInstX("eor", "EorQX", "SimdAluOp", ("uint64_t",), 4, eorCode)
    # EXT
    # Result element i is element (i + imm) of the sequence formed by
    # srcReg1 followed by srcReg2.  An index that falls beyond both sources
    # raises UndefinedInstruction.  Registered for uint8_t only, so imm is a
    # byte offset.
    extCode = '''
            for (unsigned i = 0; i < eCount; i++) {
                unsigned index = i + imm;
                if (index < eCount) {
                    destReg.elements[i] = srcReg1.elements[index];
                } else {
                    index -= eCount;
                    if (index >= eCount) {
                        fault = std::make_shared<UndefinedInstruction>(
                                      machInst, false, mnemonic);
                    } else {
                        destReg.elements[i] = srcReg2.elements[index];
                    }
                }
            }
    '''
    # The first argument is the mnemonic; use the lowercase assembler
    # spelling like every other registration in this file (was "Ext").
    extInstX("ext", "ExtDX", "SimdMiscOp", ("uint8_t",), 2, extCode)
    extInstX("ext", "ExtQX", "SimdMiscOp", ("uint8_t",), 4, extCode)
    # FABD
    # fpOp is the common wrapper for floating-point snippets: it takes a
    # local copy of FpscrExc, evaluates the expression substituted for %s
    # into destElem, then writes the copy back to FpscrExc.  It is reused
    # by the floating-point registrations that follow.
    fpOp = '''
            FPSCR fpscr = (FPSCR) FpscrExc;
            destElem = %s;
            FpscrExc = fpscr;
    '''
    # Absolute value of the difference of the two sources.
    fabdCode = fpOp % "fplibAbs<Element>(fplibSub(srcElem1, srcElem2, fpscr))"
    threeEqualRegInstX("fabd", "FabdDX", "SimdFloatAddOp", smallFloatTypes, 2,
                       fabdCode)
    threeEqualRegInstX("fabd", "FabdQX", "SimdFloatAddOp", floatTypes, 4,
                       fabdCode)
    threeEqualRegInstX("fabd", "FabdScX", "SimdFloatAddOp", floatTypes, 4,
                       fabdCode, scalar=True)
    # FABS
    # Floating-point absolute value of each element via fplibAbs.
    fabsCode = fpOp % "fplibAbs<Element>(srcElem1)"
    # The first argument is the mnemonic; it was "Abs", which neither
    # matches the instruction name nor the lowercase spelling used by every
    # other registration in this file.
    twoEqualRegInstX("fabs", "FabsDX", "SimdFloatAluOp", smallFloatTypes, 2,
                     fabsCode)
    twoEqualRegInstX("fabs", "FabsQX", "SimdFloatAluOp", floatTypes, 4,
                     fabsCode)
    # FACGE
    # fpCmpAbsOp compares the absolute values of the two sources; the
    # remaining %s is the fplibCompare suffix ("GE" or "GT").  The result is
    # -1 when the comparison holds and 0 otherwise.
    fpCmpAbsOp = fpOp % ("fplibCompare%s<Element>(fplibAbs<Element>(srcElem1),"
                         " fplibAbs<Element>(srcElem2), fpscr) ? -1 : 0")
    facgeCode = fpCmpAbsOp % "GE"
    threeEqualRegInstX("facge", "FacgeDX", "SimdFloatCmpOp", smallFloatTypes,
                       2, facgeCode)
    threeEqualRegInstX("facge", "FacgeQX", "SimdFloatCmpOp", floatTypes, 4,
                       facgeCode)
    threeEqualRegInstX("facge", "FacgeScX", "SimdFloatCmpOp", floatTypes, 4,
                       facgeCode, scalar=True)
    # FACGT
    facgtCode = fpCmpAbsOp % "GT"
    threeEqualRegInstX("facgt", "FacgtDX", "SimdFloatCmpOp", smallFloatTypes,
                       2, facgtCode)
    threeEqualRegInstX("facgt", "FacgtQX", "SimdFloatCmpOp", floatTypes, 4,
                       facgtCode)
    threeEqualRegInstX("facgt", "FacgtScX", "SimdFloatCmpOp", floatTypes, 4,
                       facgtCode, scalar=True)
    # FADD
    # fpBinOp is the template for two-operand fplib calls; the remaining %s
    # is the fplib function suffix (here "Add").
    fpBinOp = fpOp % "fplib%s<Element>(srcElem1, srcElem2, fpscr)"
    faddCode = fpBinOp % "Add"
    threeEqualRegInstX("fadd", "FaddDX", "SimdFloatAddOp", smallFloatTypes, 2,
                       faddCode)
    threeEqualRegInstX("fadd", "FaddQX", "SimdFloatAddOp", floatTypes, 4,
                       faddCode)
    # FADDP (scalar)
    # Same addition snippet, registered once per element type with a
    # different register count (1, 2 and 4).
    twoRegPairwiseScInstX("faddp", "FaddpScSX", "SimdFloatAddOp",
                          ("uint16_t",), 1, faddCode)
    twoRegPairwiseScInstX("faddp", "FaddpScDX", "SimdFloatAddOp",
                          ("uint32_t",), 2, faddCode)
    twoRegPairwiseScInstX("faddp", "FaddpScQX", "SimdFloatAddOp",
                          ("uint64_t",), 4, faddCode)
    # FADDP (vector)
    threeEqualRegInstX("faddp", "FaddpDX", "SimdFloatAddOp", smallFloatTypes,
                       2, faddCode, pairwise=True)
    threeEqualRegInstX("faddp", "FaddpQX", "SimdFloatAddOp", floatTypes, 4,
                       faddCode, pairwise=True)
    # Floating-point compares.  Each template leaves one %s for the
    # fplibCompare suffix ("EQ", "GE" or "GT") and produces -1 when the
    # comparison holds, 0 otherwise.
    # FCMEQ (register)
    # fpCmpOp: srcElem1 <op> srcElem2.
    fpCmpOp = fpOp % ("fplibCompare%s<Element>(srcElem1, srcElem2, fpscr) ?"
                      " -1 : 0")
    fcmeqCode = fpCmpOp % "EQ"
    threeEqualRegInstX("fcmeq", "FcmeqDX", "SimdFloatCmpOp", smallFloatTypes,
                       2, fcmeqCode)
    threeEqualRegInstX("fcmeq", "FcmeqQX", "SimdFloatCmpOp", floatTypes, 4,
                       fcmeqCode)
    threeEqualRegInstX("fcmeq", "FcmeqScX", "SimdFloatCmpOp", floatTypes, 4,
                       fcmeqCode, scalar=True)
    # FCMEQ (zero)
    # fpCmpZeroOp: srcElem1 <op> 0.
    fpCmpZeroOp = fpOp % "fplibCompare%s<Element>(srcElem1, 0, fpscr) ? -1 : 0"
    fcmeqZeroCode = fpCmpZeroOp % "EQ"
    twoEqualRegInstX("fcmeq", "FcmeqZeroDX", "SimdFloatCmpOp", smallFloatTypes,
                     2, fcmeqZeroCode)
    twoEqualRegInstX("fcmeq", "FcmeqZeroQX", "SimdFloatCmpOp", floatTypes, 4,
                     fcmeqZeroCode)
    twoEqualRegInstX("fcmeq", "FcmeqZeroScX", "SimdFloatCmpOp", floatTypes, 4,
                     fcmeqZeroCode, scalar=True)
    # FCMGE (register)
    fcmgeCode = fpCmpOp % "GE"
    threeEqualRegInstX("fcmge", "FcmgeDX", "SimdFloatCmpOp", smallFloatTypes,
                       2, fcmgeCode)
    threeEqualRegInstX("fcmge", "FcmgeQX", "SimdFloatCmpOp", floatTypes, 4,
                       fcmgeCode)
    threeEqualRegInstX("fcmge", "FcmgeScX", "SimdFloatCmpOp", floatTypes, 4,
                       fcmgeCode, scalar=True)
    # FCMGE (zero)
    fcmgeZeroCode = fpCmpZeroOp % "GE"
    twoEqualRegInstX("fcmge", "FcmgeZeroDX", "SimdFloatCmpOp", smallFloatTypes,
                     2, fcmgeZeroCode)
    twoEqualRegInstX("fcmge", "FcmgeZeroQX", "SimdFloatCmpOp", floatTypes, 4,
                     fcmgeZeroCode)
    twoEqualRegInstX("fcmge", "FcmgeZeroScX", "SimdFloatCmpOp", floatTypes, 4,
                     fcmgeZeroCode, scalar=True)
    # FCMGT (register)
    fcmgtCode = fpCmpOp % "GT"
    threeEqualRegInstX("fcmgt", "FcmgtDX", "SimdFloatCmpOp", smallFloatTypes,
                       2, fcmgtCode)
    threeEqualRegInstX("fcmgt", "FcmgtQX", "SimdFloatCmpOp", floatTypes, 4,
                       fcmgtCode)
    threeEqualRegInstX("fcmgt", "FcmgtScX", "SimdFloatCmpOp", floatTypes, 4,
                       fcmgtCode, scalar=True)
    # FCMGT (zero)
    fcmgtZeroCode = fpCmpZeroOp % "GT"
    twoEqualRegInstX("fcmgt", "FcmgtZeroDX", "SimdFloatCmpOp", smallFloatTypes,
                     2, fcmgtZeroCode)
    twoEqualRegInstX("fcmgt", "FcmgtZeroQX", "SimdFloatCmpOp", floatTypes, 4,
                     fcmgtZeroCode)
    twoEqualRegInstX("fcmgt", "FcmgtZeroScX", "SimdFloatCmpOp", floatTypes, 4,
                     fcmgtZeroCode, scalar=True)
    # FCMLE (zero)
    # fpCmpRevZeroOp swaps the operands (0 <op> srcElem1), so "less than or
    # equal to zero" is expressed as 0 GE srcElem1 and "less than zero" as
    # 0 GT srcElem1.
    fpCmpRevZeroOp = fpOp % ("fplibCompare%s<Element>(0, srcElem1, fpscr) ?"
                             " -1 : 0")
    fcmleZeroCode = fpCmpRevZeroOp % "GE"
    twoEqualRegInstX("fcmle", "FcmleZeroDX", "SimdFloatCmpOp", smallFloatTypes,
                     2, fcmleZeroCode)
    twoEqualRegInstX("fcmle", "FcmleZeroQX", "SimdFloatCmpOp", floatTypes, 4,
                     fcmleZeroCode)
    twoEqualRegInstX("fcmle", "FcmleZeroScX", "SimdFloatCmpOp", floatTypes, 4,
                     fcmleZeroCode, scalar=True)
    # FCMLT (zero)
    fcmltZeroCode = fpCmpRevZeroOp % "GT"
    twoEqualRegInstX("fcmlt", "FcmltZeroDX", "SimdFloatCmpOp", smallFloatTypes,
                     2, fcmltZeroCode)
    twoEqualRegInstX("fcmlt", "FcmltZeroQX", "SimdFloatCmpOp", floatTypes, 4,
                     fcmltZeroCode)
    twoEqualRegInstX("fcmlt", "FcmltZeroScX", "SimdFloatCmpOp", floatTypes, 4,
                     fcmltZeroCode, scalar=True)
    # FCVTAS, FCVTAU
    # fcvtCode is the shared fplibFPToFixed template; its three open slots
    # are the second, third and fourth call arguments.  Both instructions
    # here pass "0" and FPRounding_TIEAWAY and differ only in the middle
    # ("false"/"true") argument.
    fcvtCode = fpOp % ("fplibFPToFixed<Element, Element>("
                       "srcElem1, %s, %s, %s, fpscr)")
    fcvtasCode = fcvtCode % ("0", "false", "FPRounding_TIEAWAY")
    fcvtauCode = fcvtCode % ("0", "true", "FPRounding_TIEAWAY")
    for cvtMnem, cvtStem, cvtCode in (("fcvtas", "Fcvtas", fcvtasCode),
                                      ("fcvtau", "Fcvtau", fcvtauCode)):
        # DX, QX, then the scalar class.
        twoEqualRegInstX(cvtMnem, cvtStem + "DX", "SimdCvtOp",
                         smallFloatTypes, 2, cvtCode)
        twoEqualRegInstX(cvtMnem, cvtStem + "QX", "SimdCvtOp",
                         floatTypes, 4, cvtCode)
        twoEqualRegInstX(cvtMnem, cvtStem + "ScX", "SimdCvtOp",
                         floatTypes, 4, cvtCode, scalar=True)
    # FCVTL, FCVTL2
    # Each source element is converted with fplibConvert<Element, BigElement>
    # using the rounding mode taken from fpscr via FPCRRounding().
    fcvtlCode = fpOp % ("fplibConvert<Element, BigElement>("
                        "srcElem1, FPCRRounding(fpscr), fpscr)")
    twoRegLongInstX("fcvtl", "FcvtlX", "SimdCvtOp", ("uint16_t", "uint32_t"),
                    fcvtlCode)
    # The hi=True class is the FCVTL2 instruction, so it is registered under
    # its own mnemonic (matching e.g. "fmlal2" and "raddhn2" in this file)
    # instead of reusing "fcvtl".
    twoRegLongInstX("fcvtl2", "Fcvtl2X", "SimdCvtOp", ("uint16_t", "uint32_t"),
                    fcvtlCode, hi=True)
    # FCVTMS, FCVTMU
    # Both fill the shared fcvtCode template with "0" and FPRounding_NEGINF;
    # only the middle ("false"/"true") argument differs.
    fcvtmsCode = fcvtCode % ("0", "false", "FPRounding_NEGINF")
    fcvtmuCode = fcvtCode % ("0", "true", "FPRounding_NEGINF")
    for cvtMnem, cvtStem, cvtCode in (("fcvtms", "Fcvtms", fcvtmsCode),
                                      ("fcvtmu", "Fcvtmu", fcvtmuCode)):
        # DX, QX, then the scalar class.
        twoEqualRegInstX(cvtMnem, cvtStem + "DX", "SimdCvtOp",
                         smallFloatTypes, 2, cvtCode)
        twoEqualRegInstX(cvtMnem, cvtStem + "QX", "SimdCvtOp",
                         floatTypes, 4, cvtCode)
        twoEqualRegInstX(cvtMnem, cvtStem + "ScX", "SimdCvtOp",
                         floatTypes, 4, cvtCode, scalar=True)
    # FCVTN, FCVTN2
    # Each source element is converted with fplibConvert<BigElement, Element>
    # using the rounding mode taken from fpscr via FPCRRounding().
    fcvtnCode = fpOp % ("fplibConvert<BigElement, Element>("
                        "srcElem1, FPCRRounding(fpscr), fpscr)")
    twoRegNarrowInstX("fcvtn", "FcvtnX", "SimdCvtOp",
                      ("uint16_t", "uint32_t"), fcvtnCode)
    # The hi=True class is the FCVTN2 instruction, so it is registered under
    # its own mnemonic (matching e.g. "raddhn2" in this file) instead of
    # reusing "fcvtn".
    twoRegNarrowInstX("fcvtn2", "Fcvtn2X", "SimdCvtOp",
                      ("uint16_t", "uint32_t"), fcvtnCode, hi=True)
    # FCVTNS, FCVTNU, FCVTPS, FCVTPU
    # All four fill the shared fcvtCode template with "0" as first slot; the
    # N forms use FPRounding_TIEEVEN, the P forms FPRounding_POSINF, and the
    # S/U forms pass "false"/"true" as the middle argument.
    fcvtnsCode = fcvtCode % ("0", "false", "FPRounding_TIEEVEN")
    fcvtnuCode = fcvtCode % ("0", "true", "FPRounding_TIEEVEN")
    fcvtpsCode = fcvtCode % ("0", "false", "FPRounding_POSINF")
    fcvtpuCode = fcvtCode % ("0", "true", "FPRounding_POSINF")
    for cvtMnem, cvtStem, cvtCode in (("fcvtns", "Fcvtns", fcvtnsCode),
                                      ("fcvtnu", "Fcvtnu", fcvtnuCode),
                                      ("fcvtps", "Fcvtps", fcvtpsCode),
                                      ("fcvtpu", "Fcvtpu", fcvtpuCode)):
        # DX, QX, then the scalar class.
        twoEqualRegInstX(cvtMnem, cvtStem + "DX", "SimdCvtOp",
                         smallFloatTypes, 2, cvtCode)
        twoEqualRegInstX(cvtMnem, cvtStem + "QX", "SimdCvtOp",
                         floatTypes, 4, cvtCode)
        twoEqualRegInstX(cvtMnem, cvtStem + "ScX", "SimdCvtOp",
                         floatTypes, 4, cvtCode, scalar=True)
    # FCVTXN, FCVTXN2
    # Each source element is converted with fplibConvert<BigElement, Element>
    # using the fixed FPRounding_ODD rounding mode.
    fcvtxnCode = fpOp % ("fplibConvert<BigElement, Element>("
                         "srcElem1, FPRounding_ODD, fpscr)")
    twoRegNarrowInstX("fcvtxn", "FcvtxnX", "SimdCvtOp", smallFloatTypes,
                      fcvtxnCode)
    # The hi=True class is the FCVTXN2 instruction, so it is registered under
    # its own mnemonic (matching e.g. "raddhn2" in this file) instead of
    # reusing "fcvtxn".
    twoRegNarrowInstX("fcvtxn2", "Fcvtxn2X", "SimdCvtOp", smallFloatTypes,
                      fcvtxnCode, hi=True)
    twoRegNarrowInstX("fcvtxn", "FcvtxnScX", "SimdCvtOp", smallFloatTypes,
                      fcvtxnCode, scalar=True)
    # FCVTZS / FCVTZU, fixed-point and integer forms.
    # All use FPRounding_ZERO.  The fixed-point forms pass "imm" as the first
    # template slot and are registered with hasImm=True; the integer forms
    # pass "0" and take no extra keyword.
    fcvtzsCode = fcvtCode % ("imm", "false", "FPRounding_ZERO")
    fcvtzsIntCode = fcvtCode % ("0", "false", "FPRounding_ZERO")
    fcvtzuCode = fcvtCode % ("imm", "true", "FPRounding_ZERO")
    fcvtzuIntCode = fcvtCode % ("0", "true", "FPRounding_ZERO")
    for cvtMnem, cvtStem, cvtCode, cvtOpts in (
            # FCVTZS (fixed-point)
            ("fcvtzs", "FcvtzsFixed", fcvtzsCode, {"hasImm": True}),
            # FCVTZS (integer)
            ("fcvtzs", "FcvtzsInt", fcvtzsIntCode, {}),
            # FCVTZU (fixed-point)
            ("fcvtzu", "FcvtzuFixed", fcvtzuCode, {"hasImm": True}),
            # FCVTZU (integer)
            ("fcvtzu", "FcvtzuInt", fcvtzuIntCode, {})):
        # DX, QX, then the scalar class.
        twoEqualRegInstX(cvtMnem, cvtStem + "DX", "SimdCvtOp",
                         smallFloatTypes, 2, cvtCode, **cvtOpts)
        twoEqualRegInstX(cvtMnem, cvtStem + "QX", "SimdCvtOp",
                         floatTypes, 4, cvtCode, **cvtOpts)
        twoEqualRegInstX(cvtMnem, cvtStem + "ScX", "SimdCvtOp",
                         floatTypes, 4, cvtCode, scalar=True, **cvtOpts)
    # FDIV and the FMAX* / FMIN* families.
    # Three small emitters capture the repeated registration patterns.

    def emitFpVecDQ(mnem, stem, opClass, code, **opts):
        # DX then QX class of a three-register floating-point instruction.
        threeEqualRegInstX(mnem, stem + "DX", opClass, smallFloatTypes, 2,
                           code, **opts)
        threeEqualRegInstX(mnem, stem + "QX", opClass, floatTypes, 4,
                           code, **opts)

    def emitFpPairwiseSc(mnem, stem, code):
        # Scalar pairwise classes ScSX, ScDX, ScQX, one element type each.
        for sfx, elemType, regCount in (("ScSX", "uint16_t", 1),
                                        ("ScDX", "uint32_t", 2),
                                        ("ScQX", "uint64_t", 4)):
            twoRegPairwiseScInstX(mnem, stem + sfx, "SimdFloatCmpOp",
                                  (elemType,), regCount, code)

    def emitFpAcross(mnem, stem, code):
        # Across-vector DX and QX classes.
        # Note: SimdFloatCmpOp can be a bit optimistic here
        for sfx, regCount in (("DX", 2), ("QX", 4)):
            twoRegAcrossInstX(mnem, stem + sfx, "SimdFloatCmpOp",
                              ("uint16_t", "uint32_t"), regCount, code,
                              False, False, True)

    # FDIV
    fdivCode = fpBinOp % "Div"
    emitFpVecDQ("fdiv", "Fdiv", "SimdFloatDivOp", fdivCode)

    # Across-vector template: folds srcElem1 into destElem with fplib<Op>.
    fpAcrossOp = fpOp % "fplib%s<Element>(destElem, srcElem1, fpscr)"

    # FMAX
    fmaxCode = fpBinOp % "Max"
    emitFpVecDQ("fmax", "Fmax", "SimdFloatCmpOp", fmaxCode)
    # FMAXNM
    fmaxnmCode = fpBinOp % "MaxNum"
    emitFpVecDQ("fmaxnm", "Fmaxnm", "SimdFloatCmpOp", fmaxnmCode)
    # FMAXNMP (scalar)
    emitFpPairwiseSc("fmaxnmp", "Fmaxnmp", fmaxnmCode)
    # FMAXNMP (vector)
    emitFpVecDQ("fmaxnmp", "Fmaxnmp", "SimdFloatCmpOp", fmaxnmCode,
                pairwise=True)
    # FMAXNMV
    fmaxnmAcrossCode = fpAcrossOp % "MaxNum"
    emitFpAcross("fmaxnmv", "Fmaxnmv", fmaxnmAcrossCode)
    # FMAXP (scalar)
    emitFpPairwiseSc("fmaxp", "Fmaxp", fmaxCode)
    # FMAXP (vector)
    emitFpVecDQ("fmaxp", "Fmaxp", "SimdFloatCmpOp", fmaxCode, pairwise=True)
    # FMAXV
    fmaxAcrossCode = fpAcrossOp % "Max"
    emitFpAcross("fmaxv", "Fmaxv", fmaxAcrossCode)

    # FMIN
    fminCode = fpBinOp % "Min"
    emitFpVecDQ("fmin", "Fmin", "SimdFloatCmpOp", fminCode)
    # FMINNM
    fminnmCode = fpBinOp % "MinNum"
    emitFpVecDQ("fminnm", "Fminnm", "SimdFloatCmpOp", fminnmCode)
    # FMINNMP (scalar)
    emitFpPairwiseSc("fminnmp", "Fminnmp", fminnmCode)
    # FMINNMP (vector)
    emitFpVecDQ("fminnmp", "Fminnmp", "SimdFloatCmpOp", fminnmCode,
                pairwise=True)
    # FMINNMV
    fminnmAcrossCode = fpAcrossOp % "MinNum"
    emitFpAcross("fminnmv", "Fminnmv", fminnmAcrossCode)
    # FMINP (scalar)
    emitFpPairwiseSc("fminp", "Fminp", fminCode)
    # FMINP (vector)
    emitFpVecDQ("fminp", "Fminp", "SimdFloatCmpOp", fminCode, pairwise=True)
    # FMINV
    fminAcrossCode = fpAcrossOp % "Min"
    emitFpAcross("fminv", "Fminv", fminAcrossCode)
    # FMLA / FMLAL(2) / FMLS / FMLSL(2)

    # Guard attached to every FMLAL*/FMLSL* class below: the generated code
    # reads ID_AA64ISAR0_EL1 and returns an UndefinedInstruction fault when
    # its fhm field is zero.
    fhm_check = '''
      AA64ISAR0 isar0 = xc->tcBase()->readMiscReg( MISCREG_ID_AA64ISAR0_EL1);
      if (!isar0.fhm)
          return std::make_shared<UndefinedInstruction>(machInst, true);
    '''

    def emitFpMulAcc(mnem, stem, code):
        # By-element DX, QX, ScX classes followed by the vector DX and QX
        # classes.  Every call passes True as the argument after the code.
        threeEqualRegInstX(mnem, stem + "ElemDX", "SimdFloatMultAccOp",
                           smallFloatTypes, 2, code, True, byElem=True)
        threeEqualRegInstX(mnem, stem + "ElemQX", "SimdFloatMultAccOp",
                           floatTypes, 4, code, True, byElem=True)
        threeEqualRegInstX(mnem, stem + "ElemScX", "SimdFloatMultAccOp",
                           floatTypes, 4, code, True, byElem=True,
                           scalar=True)
        threeEqualRegInstX(mnem, stem + "DX", "SimdFloatMultAccOp",
                           smallFloatTypes, 2, code, True)
        threeEqualRegInstX(mnem, stem + "QX", "SimdFloatMultAccOp",
                           floatTypes, 4, code, True)

    def emitFpMulAccLong(mnem, stem, code):
        # Order: <op> (by element), <op>2 (by element), <op> (vector),
        # <op>2 (vector).  Within each group the DX class (short=True) is
        # registered before the QX class; the "2" forms add hi=True.
        for longMnem, longStem, opts in (
                (mnem, stem + "Elem", {"byElem": True}),
                (mnem + "2", stem + "2Elem", {"byElem": True, "hi": True}),
                (mnem, stem, {}),
                (mnem + "2", stem + "2", {"hi": True})):
            threeRegLongInstX(longMnem, longStem + "DX", "SimdFloatMultAccOp",
                              ("uint16_t",), code, True, short=True,
                              extra_check=fhm_check, **opts)
            threeRegLongInstX(longMnem, longStem + "QX", "SimdFloatMultAccOp",
                              ("uint16_t",), code, True,
                              extra_check=fhm_check, **opts)

    # FMLA (by element), FMLA (vector)
    fmlaCode = fpOp % ("fplibMulAdd<Element>("
                       "destElem, srcElem1, srcElem2, fpscr)")
    emitFpMulAcc("fmla", "Fmla", fmlaCode)
    # FMLAL, FMLAL2 (by element and vector)
    fmlalCode = fpOp % ("fplibMulAddH("
                       "destElem, srcElem1, srcElem2, fpscr)")
    emitFpMulAccLong("fmlal", "Fmlal", fmlalCode)
    # FMLS (by element), FMLS (vector): same as FMLA with srcElem1 negated
    fmlsCode = fpOp % ("fplibMulAdd<Element>(destElem,"
                       " fplibNeg<Element>(srcElem1), srcElem2, fpscr)")
    emitFpMulAcc("fmls", "Fmls", fmlsCode)
    # FMLSL, FMLSL2 (by element and vector)
    fmlslCode = fpOp % ("fplibMulAddH(destElem, "
                       "fplibNeg<Element>(srcElem1), srcElem2, fpscr)")
    emitFpMulAccLong("fmlsl", "Fmlsl", fmlslCode)
    # FMOV: every destination element is set to the immediate.
    fmovCode = 'destElem = imm;'
    for fmovCls, fmovTypes, fmovRegs in (("FmovDX", smallFloatTypes, 2),
                                         ("FmovQX", floatTypes, 4)):
        oneRegImmInstX("fmov", fmovCls, "SimdMiscOp", fmovTypes, fmovRegs,
                       fmovCode)

    # FMUL and FMULX, one table row per registered class (order matters).
    fmulCode = fpBinOp % "Mul"
    fmulxCode = fpBinOp % "MulX"
    for mulMnem, mulCls, mulTypes, mulRegs, mulElemCode, mulOpts in (
            # FMUL (by element)
            ("fmul", "FmulElemDX", smallFloatTypes, 2, fmulCode,
             {"byElem": True}),
            ("fmul", "FmulElemQX", floatTypes, 4, fmulCode,
             {"byElem": True}),
            ("fmul", "FmulElemScX", floatTypes, 4, fmulCode,
             {"byElem": True, "scalar": True}),
            # FMUL (vector)
            ("fmul", "FmulDX", smallFloatTypes, 2, fmulCode, {}),
            ("fmul", "FmulQX", floatTypes, 4, fmulCode, {}),
            # FMULX
            ("fmulx", "FmulxDX", smallFloatTypes, 2, fmulxCode, {}),
            ("fmulx", "FmulxQX", floatTypes, 4, fmulxCode, {}),
            ("fmulx", "FmulxScX", floatTypes, 4, fmulxCode,
             {"scalar": True}),
            # FMULX (by element)
            ("fmulx", "FmulxElemDX", smallFloatTypes, 2, fmulxCode,
             {"byElem": True}),
            ("fmulx", "FmulxElemQX", floatTypes, 4, fmulxCode,
             {"byElem": True}),
            ("fmulx", "FmulxElemScX", floatTypes, 4, fmulxCode,
             {"byElem": True, "scalar": True})):
        threeEqualRegInstX(mulMnem, mulCls, "SimdFloatMultOp", mulTypes,
                           mulRegs, mulElemCode, **mulOpts)
    # FNEG
    # Each destination element is fplibNeg of the matching source element.
    # The first argument is the instruction mnemonic; it was previously
    # "Neg", unlike every other registration here, which passes the
    # lower-case instruction name (cf. "neg" for the integer NEG).
    fnegCode = fpOp % "fplibNeg<Element>(srcElem1)"
    twoEqualRegInstX("fneg", "FnegDX", "SimdFloatAluOp", smallFloatTypes, 2,
                     fnegCode)
    twoEqualRegInstX("fneg", "FnegQX", "SimdFloatAluOp", floatTypes, 4,
                     fnegCode)
    # FRECPE (two-register) and FRECPS (three-register): DX, QX and ScX
    # classes each, all under the SimdFloatMultAccOp op class.
    frecpeCode = fpOp % "fplibRecipEstimate<Element>(srcElem1, fpscr)"
    frecpsCode = fpBinOp % "RecipStepFused"
    for recEmit, recMnem, recStem, recCode in (
            # FRECPE
            (twoEqualRegInstX, "frecpe", "Frecpe", frecpeCode),
            # FRECPS
            (threeEqualRegInstX, "frecps", "Frecps", frecpsCode)):
        recEmit(recMnem, recStem + "DX", "SimdFloatMultAccOp",
                smallFloatTypes, 2, recCode)
        recEmit(recMnem, recStem + "QX", "SimdFloatMultAccOp",
                floatTypes, 4, recCode)
        recEmit(recMnem, recStem + "ScX", "SimdFloatMultAccOp",
                floatTypes, 4, recCode, scalar=True)
    # FRECPX: only a scalar class is registered.
    frecpxCode = fpOp % "fplibRecpX<Element>(srcElem1, fpscr)"
    twoEqualRegInstX("frecpx", "FrecpxX", "SimdFloatMultAccOp", floatTypes, 4,
                     frecpxCode, scalar=True)
    # FRINTA, FRINTI, FRINTM, FRINTN, FRINTP, FRINTX, FRINTZ
    # frintCode is the shared fplibRoundInt template; its two open slots are
    # the rounding-mode argument and the boolean argument that follows it.
    # Only FRINTX passes "true" for the latter.
    frintCode = fpOp % "fplibRoundInt<Element>(srcElem1, %s, %s, fpscr)"
    frintaCode = frintCode % ("FPRounding_TIEAWAY", "false")
    frintiCode = frintCode % ("FPCRRounding(fpscr)", "false")
    frintmCode = frintCode % ("FPRounding_NEGINF", "false")
    frintnCode = frintCode % ("FPRounding_TIEEVEN", "false")
    frintpCode = frintCode % ("FPRounding_POSINF", "false")
    frintxCode = frintCode % ("FPCRRounding(fpscr)", "true")
    frintzCode = frintCode % ("FPRounding_ZERO", "false")
    for rndMnem, rndStem, rndCode in (("frinta", "Frinta", frintaCode),
                                      ("frinti", "Frinti", frintiCode),
                                      ("frintm", "Frintm", frintmCode),
                                      ("frintn", "Frintn", frintnCode),
                                      ("frintp", "Frintp", frintpCode),
                                      ("frintx", "Frintx", frintxCode),
                                      ("frintz", "Frintz", frintzCode)):
        twoEqualRegInstX(rndMnem, rndStem + "DX", "SimdCvtOp",
                         smallFloatTypes, 2, rndCode)
        twoEqualRegInstX(rndMnem, rndStem + "QX", "SimdCvtOp",
                         floatTypes, 4, rndCode)
    # FRINT32X, FRINT32Z, FRINT64X, FRINT64Z

    # Guard attached to all four instructions: the generated code reads
    # ID_AA64ISAR1_EL1 and returns an UndefinedInstruction fault when its
    # frintts field is zero.
    frintts_check = '''
      AA64ISAR1 isar1 = xc->tcBase()->readMiscReg( MISCREG_ID_AA64ISAR1_EL1);
      if (!isar1.frintts)
          return std::make_shared<UndefinedInstruction>(machInst, true);
    '''
    # fplibRoundIntN call with the rounding mode and the 32/64 argument left
    # open; the X forms take the mode from fpscr, the Z forms use
    # FPRounding_ZERO.
    frintNCall = "fplibRoundIntN<Element>(srcElem1, %s, true, %d, fpscr)"
    frint32xCode = fpOp % (frintNCall % ("FPCRRounding(fpscr)", 32))
    frint32zCode = fpOp % (frintNCall % ("FPRounding_ZERO", 32))
    frint64xCode = fpOp % (frintNCall % ("FPCRRounding(fpscr)", 64))
    frint64zCode = fpOp % (frintNCall % ("FPRounding_ZERO", 64))
    for rndNMnem, rndNStem, rndNCode in (
            ("frint32x", "Frint32x", frint32xCode),
            ("frint32z", "Frint32z", frint32zCode),
            ("frint64x", "Frint64x", frint64xCode),
            ("frint64z", "Frint64z", frint64zCode)):
        twoEqualRegInstX(rndNMnem, rndNStem + "DX", "SimdCvtOp",
                         ("uint32_t",), 2, rndNCode,
                         extra_check=frintts_check)
        twoEqualRegInstX(rndNMnem, rndNStem + "QX", "SimdCvtOp",
                         ("uint32_t", "uint64_t"), 4, rndNCode,
                         extra_check=frintts_check)
    # FRSQRTE, FRSQRTS, FSQRT, FSUB
    # FRSQRTE/FRSQRTS get DX, QX and ScX classes; FSQRT/FSUB only DX and QX.
    frsqrteCode = fpOp % "fplibRSqrtEstimate<Element>(srcElem1, fpscr)"
    frsqrtsCode = fpBinOp % "RSqrtStepFused"
    fsqrtCode = fpOp % "fplibSqrt<Element>(srcElem1, fpscr)"
    fsubCode = fpBinOp % "Sub"
    for rtEmit, rtMnem, rtStem, rtOpClass, rtCode, rtHasScalar in (
            # FRSQRTE
            (twoEqualRegInstX, "frsqrte", "Frsqrte", "SimdFloatSqrtOp",
             frsqrteCode, True),
            # FRSQRTS
            (threeEqualRegInstX, "frsqrts", "Frsqrts", "SimdFloatMiscOp",
             frsqrtsCode, True),
            # FSQRT
            (twoEqualRegInstX, "fsqrt", "Fsqrt", "SimdFloatSqrtOp",
             fsqrtCode, False),
            # FSUB
            (threeEqualRegInstX, "fsub", "Fsub", "SimdFloatAddOp",
             fsubCode, False)):
        rtEmit(rtMnem, rtStem + "DX", rtOpClass, smallFloatTypes, 2, rtCode)
        rtEmit(rtMnem, rtStem + "QX", rtOpClass, floatTypes, 4, rtCode)
        if rtHasScalar:
            rtEmit(rtMnem, rtStem + "ScX", rtOpClass, floatTypes, 4, rtCode,
                   scalar=True)
    # INS (element)
    insFromVecElemInstX("ins", "InsElemX", "SimdMiscOp", unsignedTypes, 4)
    # INS (general register): one class per general-register width letter,
    # W first, then X.
    for insGprWidth, insGprTypes in (("W", smallUnsignedTypes),
                                     ("X", unsignedTypes)):
        insFromGprInstX("ins", "InsGpr" + insGprWidth + "X", "SimdMiscOp",
                        insGprTypes, 4, insGprWidth)
    # MLA, MLS, MOVI, MUL

    def emitIntMul(mnem, stem, opClass, code, *extra):
        # By-element DX/QX classes (16- and 32-bit elements only) followed
        # by the vector DX/QX classes.  Any extra positional arguments are
        # forwarded right after the element code.
        for sfx, regCount in (("DX", 2), ("QX", 4)):
            threeEqualRegInstX(mnem, stem + "Elem" + sfx, opClass,
                               ("uint16_t", "uint32_t"), regCount, code,
                               *extra, byElem=True)
        for sfx, regCount in (("DX", 2), ("QX", 4)):
            threeEqualRegInstX(mnem, stem + sfx, opClass, smallUnsignedTypes,
                               regCount, code, *extra)

    # MLA (by element), MLA (vector)
    mlaCode = "destElem += srcElem1 * srcElem2;"
    emitIntMul("mla", "Mla", "SimdMultAccOp", mlaCode, True)
    # MLS (by element), MLS (vector)
    mlsCode = "destElem -= srcElem1 * srcElem2;"
    emitIntMul("mls", "Mls", "SimdMultAccOp", mlsCode, True)
    # MOV (element) -> alias to INS (element)
    # MOV (from general) -> alias to INS (general register)
    # MOV (scalar) -> alias to DUP (element)
    # MOV (to general) -> alias to UMOV
    # MOV (vector) -> alias to ORR (register)
    # MOVI
    movImmCode = "destElem = imm;"
    for moviSfx, moviRegs in (("DX", 2), ("QX", 4)):
        oneRegImmInstX("movi", "Movi" + moviSfx, "SimdMiscOp", ("uint64_t",),
                       moviRegs, movImmCode)
    # MUL (by element), MUL (vector)
    mulCode = "destElem = srcElem1 * srcElem2;"
    emitIntMul("mul", "Mul", "SimdMultOp", mulCode)
    # MVN, MVNI, NEG, ORN, ORR (immediate), ORR (register)
    # All are SimdAluOp and all get a DX and a QX class.  The bitwise ones
    # operate on uint64_t elements; NEG uses the signed element types.
    mvnCode = "destElem = ~srcElem1;"
    mvniCode = "destElem = ~imm;"
    negCode = "destElem = -srcElem1;"
    ornCode = "destElem = srcElem1 | ~srcElem2;"
    orrImmCode = "destElem |= imm;"
    orrCode = "destElem = srcElem1 | srcElem2;"
    for aluEmit, aluMnem, aluStem, aluTypes, aluCode, aluExtra in (
            # MVN
            (twoEqualRegInstX, "mvn", "Mvn", ("uint64_t",), mvnCode, ()),
            # MVNI
            (oneRegImmInstX, "mvni", "Mvni", ("uint64_t",), mvniCode, ()),
            # NEG
            (twoEqualRegInstX, "neg", "Neg", signedTypes, negCode, ()),
            # NOT -> alias to MVN
            # ORN
            (threeEqualRegInstX, "orn", "Orn", ("uint64_t",), ornCode, ()),
            # ORR (immediate): passes True after the element code
            (oneRegImmInstX, "orr", "OrrImm", ("uint64_t",), orrImmCode,
             (True,)),
            # ORR (register)
            (threeEqualRegInstX, "orr", "Orr", ("uint64_t",), orrCode, ())):
        for aluSfx, aluRegs in (("DX", 2), ("QX", 4)):
            aluEmit(aluMnem, aluStem + aluSfx, "SimdAluOp", aluTypes, aluRegs,
                    aluCode, *aluExtra)
    # PMUL
    # For every set bit j of srcElem2, srcElem1 shifted left by j is XORed
    # into the (initially zero) destination element.
    pmulCode = '''
            destElem = 0;
            for (unsigned j = 0; j < sizeof(Element) * 8; j++) {
                if (bits(srcElem2, j))
                    destElem ^= srcElem1 << j;
            }
    '''
    for pmulSfx, pmulRegs in (("DX", 2), ("QX", 4)):
        threeEqualRegInstX("pmul", "Pmul" + pmulSfx, "SimdMultOp",
                           ("uint8_t",), pmulRegs, pmulCode)
    # PMULL, PMULL2
    # Note: 64-bit PMULL is not available (Crypto. Extension)
    # Same XOR-of-shifted-copies loop as PMUL, but srcElem1 is cast to
    # BigElement before shifting, so the shifted bits are kept in the
    # BigElement-sized destination.
    pmullCode = '''
            destElem = 0;
            for (unsigned j = 0; j < sizeof(Element) * 8; j++) {
                if (bits(srcElem2, j))
                    destElem ^= (BigElement)srcElem1 << j;
            }
    '''
    threeRegLongInstX("pmull", "PmullX", "SimdMultOp", ("uint8_t",), pmullCode)
    # The hi=True class is the PMULL2 instruction, so it is registered under
    # its own mnemonic (matching e.g. "fmlal2" and "raddhn2" in this file)
    # instead of reusing "pmull".
    threeRegLongInstX("pmull2", "Pmull2X", "SimdMultOp", ("uint8_t",),
                      pmullCode, hi=True)
    # RADDHN, RADDHN2
    # Snippet: add the two sources as BigElement, add the rounding constant
    # 1 << (element bits - 1), then shift right by the element width in bits
    # so only the upper part of the sum is stored in destElem.
    raddhnCode = '''
            destElem = ((BigElement)srcElem1 + (BigElement)srcElem2 +
                        ((BigElement)1 << (sizeof(Element) * 8 - 1))) >>
                       (sizeof(Element) * 8);
    '''
    threeRegNarrowInstX("raddhn", "RaddhnX", "SimdAddOp", smallUnsignedTypes,
                        raddhnCode)
    threeRegNarrowInstX("raddhn2", "Raddhn2X", "SimdAddOp", smallUnsignedTypes,
                        raddhnCode, hi=True)
    # RBIT
    # Snippet: bit reversal within each element. On iteration i the current
    # least-significant bit of temp is ORed into destElem at position
    # (element bits - 1 - i), then temp is shifted right by one.
    rbitCode = '''
            destElem = 0;
            Element temp = srcElem1;
            for (int i = 0; i < 8 * sizeof(Element); i++) {
                destElem = destElem  | ((temp & 0x1) <<
                                        (8 * sizeof(Element) - 1 - i));
                temp >>= 1;
            }
    '''
    twoEqualRegInstX("rbit", "RbitDX", "SimdAluOp", ("uint8_t",), 2, rbitCode)
    twoEqualRegInstX("rbit", "RbitQX", "SimdAluOp", ("uint8_t",), 4, rbitCode)
    # REV16
    # The three REV snippets are identical apart from the shift constant.
    # Each copies the source element unchanged and sets j = i ^ reverseMask.
    # Neither i nor j is declared in the snippet; they presumably are the
    # element index and destination index of the loop generated by
    # twoEqualRegInstX -- TODO confirm against the helper.
    # groupSize is (1 << 1) = 2 divided by sizeof(Element), and reverseMask
    # is groupSize - 1, so the XOR flips the index within each group.
    rev16Code = '''
            destElem = srcElem1;
            unsigned groupSize = ((1 << 1) / sizeof(Element));
            unsigned reverseMask = (groupSize - 1);
            j = i ^ reverseMask;
    '''
    twoEqualRegInstX("rev16", "Rev16DX", "SimdAluOp", ("uint8_t",), 2,
                     rev16Code)
    twoEqualRegInstX("rev16", "Rev16QX", "SimdAluOp", ("uint8_t",), 4,
                     rev16Code)
    # REV32
    # Same as REV16 with (1 << 2) = 4 as the numerator of groupSize;
    # registered for 8-bit and 16-bit elements.
    rev32Code = '''
            destElem = srcElem1;
            unsigned groupSize = ((1 << 2) / sizeof(Element));
            unsigned reverseMask = (groupSize - 1);
            j = i ^ reverseMask;
    '''
    twoEqualRegInstX("rev32", "Rev32DX", "SimdAluOp", ("uint8_t", "uint16_t"),
                     2, rev32Code)
    twoEqualRegInstX("rev32", "Rev32QX", "SimdAluOp", ("uint8_t", "uint16_t"),
                     4, rev32Code)
    # REV64
    # Same as REV16 with (1 << 3) = 8 as the numerator of groupSize;
    # smallUnsignedTypes is passed as the element-type list.
    rev64Code = '''
            destElem = srcElem1;
            unsigned groupSize = ((1 << 3) / sizeof(Element));
            unsigned reverseMask = (groupSize - 1);
            j = i ^ reverseMask;
    '''
    twoEqualRegInstX("rev64", "Rev64DX", "SimdAluOp", smallUnsignedTypes, 2,
                     rev64Code)
    twoEqualRegInstX("rev64", "Rev64QX", "SimdAluOp", smallUnsignedTypes, 4,
                     rev64Code)
    # RSHRN, RSHRN2
    # Snippet: rounding right shift by the immediate.
    #  - imm larger than the bit width of srcElem1: result is 0.
    #  - non-zero imm: rBit is bit (imm - 1) of the source, i.e. the last bit
    #    shifted out; the shift is split into (imm - 1) followed by 1, so a
    #    shift by the full width is never issued as one operation; rBit is
    #    then added to the shifted value.
    #  - imm == 0: the source is copied unchanged.
    rshrnCode = '''
            if (imm > sizeof(srcElem1) * 8) {
                destElem = 0;
            } else if (imm) {
                Element rBit = bits(srcElem1, imm - 1);
                destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit;
            } else {
                destElem = srcElem1;
            }
    '''
    twoRegNarrowInstX("rshrn", "RshrnX", "SimdShiftOp", smallUnsignedTypes,
                      rshrnCode, hasImm=True)
    twoRegNarrowInstX("rshrn2", "Rshrn2X", "SimdShiftOp", smallUnsignedTypes,
                      rshrnCode, hasImm=True, hi=True)
    # RSUBHN, RSUBHN2
    # Snippet: subtract the second source from the first as BigElement, add
    # the rounding constant 1 << (element bits - 1), then shift right by the
    # element width in bits so only the upper part is stored in destElem.
    # NOTE(review): these registrations pass smallTypes whereas the otherwise
    # parallel RADDHN registrations pass smallUnsignedTypes -- confirm the
    # difference is intentional.
    rsubhnCode = '''
            destElem = ((BigElement)srcElem1 - (BigElement)srcElem2 +
                        ((BigElement)1 << (sizeof(Element) * 8 - 1))) >>
                       (sizeof(Element) * 8);
    '''
    threeRegNarrowInstX("rsubhn", "RsubhnX", "SimdAddOp", smallTypes,
                        rsubhnCode)
    threeRegNarrowInstX("rsubhn2", "Rsubhn2X", "SimdAddOp", smallTypes,
                        rsubhnCode, hi=True)
    # SABA
    # Snippet: absolute difference of the two sources (larger minus smaller),
    # accumulated into destElem with "+=". The positional True after the
    # snippet is presumably the helper's read-destination flag, needed
    # because the old destination value is an input -- TODO confirm.
    abaCode = '''
            destElem += (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) :
                                                (srcElem2 - srcElem1);
    '''
    threeEqualRegInstX("saba", "SabaDX", "SimdAddAccOp", smallSignedTypes, 2,
                       abaCode, True)
    threeEqualRegInstX("saba", "SabaQX", "SimdAddAccOp", smallSignedTypes, 4,
                       abaCode, True)
    # SABAL, SABAL2
    # Accumulating absolute difference like SABA, but both operands are cast
    # to BigElement before the subtraction.
    abalCode = '''
            destElem += (srcElem1 > srcElem2) ?
                ((BigElement)srcElem1 - (BigElement)srcElem2) :
                ((BigElement)srcElem2 - (BigElement)srcElem1);
    '''
    threeRegLongInstX("sabal", "SabalX", "SimdAddAccOp", smallSignedTypes,
                      abalCode, True)
    threeRegLongInstX("sabal2", "Sabal2X", "SimdAddAccOp", smallSignedTypes,
                      abalCode, True, hi=True)
    # SABD
    # Non-accumulating absolute difference: destElem is overwritten ("=").
    abdCode = '''
            destElem = (srcElem1 > srcElem2) ? (srcElem1 - srcElem2) :
                                               (srcElem2 - srcElem1);
    '''
    threeEqualRegInstX("sabd", "SabdDX", "SimdAddOp", smallSignedTypes, 2,
                       abdCode)
    threeEqualRegInstX("sabd", "SabdQX", "SimdAddOp", smallSignedTypes, 4,
                       abdCode)
    # SABDL, SABDL2
    # Non-accumulating absolute difference with operands cast to BigElement.
    # NOTE(review): the snippet overwrites destElem, yet these registrations
    # use the SimdAddAccOp class and pass the positional True exactly like
    # the accumulating SABAL above (SABD uses SimdAddOp and no flag) --
    # confirm whether that is intended.
    abdlCode = '''
            destElem = (srcElem1 > srcElem2) ?
                ((BigElement)srcElem1 - (BigElement)srcElem2) :
                ((BigElement)srcElem2 - (BigElement)srcElem1);
    '''
    threeRegLongInstX("sabdl", "SabdlX", "SimdAddAccOp", smallSignedTypes,
                      abdlCode, True)
    threeRegLongInstX("sabdl2", "Sabdl2X", "SimdAddAccOp", smallSignedTypes,
                      abdlCode, True, hi=True)
    # SADALP
    # Snippet: sum of two source elements (each cast to BigElement),
    # accumulated into destElem with "+=". Which two elements are paired is
    # decided by twoRegCondenseInstX, not by the snippet.
    adalpCode = "destElem += (BigElement)srcElem1 + (BigElement)srcElem2;"
    twoRegCondenseInstX("sadalp", "SadalpDX", "SimdAddOp", smallSignedTypes, 2,
                        adalpCode, True)
    twoRegCondenseInstX("sadalp", "SadalpQX", "SimdAddOp", smallSignedTypes, 4,
                        adalpCode, True)
    # SADDL, SADDL2
    # Snippet: non-accumulating sum of the two sources cast to BigElement.
    # addlwCode is shared by the SADDL, SADDLP and SADDW registrations below.
    addlwCode = "destElem = (BigElement)srcElem1 + (BigElement)srcElem2;"
    threeRegLongInstX("saddl", "SaddlX", "SimdAddAccOp", smallSignedTypes,
                      addlwCode)
    threeRegLongInstX("saddl2", "Saddl2X", "SimdAddAccOp", smallSignedTypes,
                      addlwCode, hi=True)
    # SADDLP
    # Reuses addlwCode through the condensing helper (no read-destination
    # flag, unlike SADALP).
    twoRegCondenseInstX("saddlp", "SaddlpDX", "SimdAddOp", smallSignedTypes, 2,
                        addlwCode)
    twoRegCondenseInstX("saddlp", "SaddlpQX", "SimdAddOp", smallSignedTypes, 4,
                        addlwCode)
    # SADDLV
    # Note: SimdAddOp can be a bit optimistic here
    # Snippet adds each source element, cast to BigElement, into destElem;
    # the initial value of destElem is presumably set up by
    # twoRegAcrossInstX -- TODO confirm. The int32_t variant is registered
    # separately with doubleDest=True.
    addAcrossLongCode = "destElem += (BigElement)srcElem1;"
    twoRegAcrossInstX("saddlv", "SaddlvDX", "SimdAddOp", ("int8_t", "int16_t"),
                      2, addAcrossLongCode, long=True)
    twoRegAcrossInstX("saddlv", "SaddlvQX", "SimdAddOp", ("int8_t", "int16_t"),
                      4, addAcrossLongCode, long=True)
    twoRegAcrossInstX("saddlv", "SaddlvBQX", "SimdAddOp", ("int32_t",), 4,
                      addAcrossLongCode, doubleDest=True, long=True)
    # SADDW, SADDW2
    # Reuses addlwCode through the "wide" helper.
    threeRegWideInstX("saddw", "SaddwX", "SimdAddAccOp", smallSignedTypes,
                      addlwCode)
    threeRegWideInstX("saddw2", "Saddw2X", "SimdAddAccOp", smallSignedTypes,
                      addlwCode, hi=True)
    # SCVTF (fixed-point)
    # fpOp is defined outside this section; the "%" operator substitutes the
    # conversion expression below into it. The expression calls
    # fplibFixedToFP<Element> on the source sign-extended from the element
    # width (sext<sizeof(Element) * 8>), passing imm as the second argument.
    # The element types listed are unsigned; the signed interpretation comes
    # from the sext in the snippet.
    scvtfFixedCode = fpOp % ("fplibFixedToFP<Element>("
                             "sext<sizeof(Element) * 8>(srcElem1), imm,"
                             " false, FPCRRounding(fpscr), fpscr)")
    twoEqualRegInstX("scvtf", "ScvtfFixedDX", "SimdCvtOp", smallFloatTypes, 2,
                     scvtfFixedCode, hasImm=True)
    twoEqualRegInstX("scvtf", "ScvtfFixedSQX", "SimdCvtOp", smallFloatTypes, 4,
                     scvtfFixedCode, hasImm=True)
    twoEqualRegInstX("scvtf", "ScvtfFixedDQX", "SimdCvtOp", ("uint64_t",), 4,
                     scvtfFixedCode, hasImm=True)
    twoEqualRegInstX("scvtf", "ScvtfFixedScSX", "SimdCvtOp", smallFloatTypes,
                     4, scvtfFixedCode, hasImm=True, scalar=True)
    twoEqualRegInstX("scvtf", "ScvtfFixedScDX", "SimdCvtOp", ("uint64_t",), 4,
                     scvtfFixedCode, hasImm=True, scalar=True)
    # SCVTF (integer)
    # Same conversion call as the fixed-point form, with the literal 0 in
    # place of imm as the second argument; no hasImm flag is passed.
    scvtfIntCode = fpOp % ("fplibFixedToFP<Element>("
                           "sext<sizeof(Element) * 8>(srcElem1), 0,"
                           " false, FPCRRounding(fpscr), fpscr)")
    twoEqualRegInstX("scvtf", "ScvtfIntDX", "SimdCvtOp", smallFloatTypes, 2,
                     scvtfIntCode)
    twoEqualRegInstX("scvtf", "ScvtfIntSQX", "SimdCvtOp", smallFloatTypes, 4,
                     scvtfIntCode)
    twoEqualRegInstX("scvtf", "ScvtfIntDQX", "SimdCvtOp", ("uint64_t",), 4,
                     scvtfIntCode)
    # Scalar forms are registered one element width at a time (16/32/64 bit).
    twoEqualRegInstX("scvtf", "ScvtfIntScHX", "SimdCvtOp", ("uint16_t",), 4,
                     scvtfIntCode, scalar=True)
    twoEqualRegInstX("scvtf", "ScvtfIntScSX", "SimdCvtOp", ("uint32_t",), 4,
                     scvtfIntCode, scalar=True)
    twoEqualRegInstX("scvtf", "ScvtfIntScDX", "SimdCvtOp", ("uint64_t",), 4,
                     scvtfIntCode, scalar=True)
    # SHADD
    # Snippet: halving add computed without forming the full-width sum.
    # carryBit is the carry out of adding the two least-significant bits;
    # each operand has its low bit cleared and is divided by 2, the halves
    # are added and carryBit is added on top. The in-snippet comment gives
    # the reason for using "/ 2" instead of ">> 1".
    haddCode = '''
            Element carryBit =
                (((unsigned)srcElem1 & 0x1) +
                 ((unsigned)srcElem2 & 0x1)) >> 1;
            // Use division instead of a shift to ensure the sign extension
            // works right. The compiler will figure out if it can be a shift.
            // Mask the inputs so they get truncated correctly.
            destElem = (((srcElem1 & ~(Element)1) / 2) +
                        ((srcElem2 & ~(Element)1) / 2)) + carryBit;
    '''
    threeEqualRegInstX("shadd", "ShaddDX", "SimdAddOp", smallSignedTypes, 2,
                       haddCode)
    threeEqualRegInstX("shadd", "ShaddQX", "SimdAddOp", smallSignedTypes, 4,
                       haddCode)
    # SHL
    # Snippet: left shift by the immediate. When imm is at least the element
    # width in bits the shift is issued as (width - 1) followed by 1 instead
    # of a single shift by imm.
    shlCode = '''
            if (imm >= sizeof(Element) * 8)
                destElem = (srcElem1 << (sizeof(Element) * 8 - 1)) << 1;
            else
                destElem = srcElem1 << imm;
    '''
    twoEqualRegInstX("shl", "ShlDX", "SimdShiftOp", unsignedTypes, 2, shlCode,
                     hasImm=True)
    twoEqualRegInstX("shl", "ShlQX", "SimdShiftOp", unsignedTypes, 4, shlCode,
                     hasImm=True)
    # SHLL, SHLL2
    # Snippet: cast the source to BigElement and shift left by the source
    # element width in bits; no immediate is used.
    # NOTE(review): the hi=True registration (Shll2X) is given the mnemonic
    # "shll", whereas e.g. the Shrn2X registration below uses "shrn2" --
    # confirm whether "shll2" was intended.
    shllCode = "destElem = ((BigElement)srcElem1) << (sizeof(Element) * 8);"
    twoRegLongInstX("shll", "ShllX", "SimdShiftOp", smallTypes, shllCode)
    twoRegLongInstX("shll", "Shll2X", "SimdShiftOp", smallTypes, shllCode,
                    hi=True)
    # SHRN, SHRN2
    # Snippet: right shift by the immediate; the result is 0 when imm is at
    # least the bit width of srcElem1.
    shrnCode = '''
            if (imm >= sizeof(srcElem1) * 8) {
                destElem = 0;
            } else {
                destElem = srcElem1 >> imm;
            }
    '''
    twoRegNarrowInstX("shrn", "ShrnX", "SimdShiftOp", smallUnsignedTypes,
                      shrnCode, hasImm=True)
    twoRegNarrowInstX("shrn2", "Shrn2X", "SimdShiftOp", smallUnsignedTypes,
                      shrnCode, hasImm=True, hi=True)
    # SHSUB
    # Snippet: halving subtract computed without forming the full-width
    # difference. borrowBit is bit 1 of (lsb(src1) - lsb(src2)) shifted down,
    # which is 1 only when the low-bit subtraction goes negative; each
    # operand has its low bit cleared and is divided by 2, the halves are
    # subtracted and borrowBit is subtracted on top.
    hsubCode = '''
            Element borrowBit =
                (((srcElem1 & 0x1) - (srcElem2 & 0x1)) >> 1) & 0x1;
            // Use division instead of a shift to ensure the sign extension
            // works right. The compiler will figure out if it can be a shift.
            // Mask the inputs so they get truncated correctly.
            destElem = (((srcElem1 & ~(Element)1) / 2) -
                        ((srcElem2 & ~(Element)1) / 2)) - borrowBit;
    '''
    threeEqualRegInstX("shsub", "ShsubDX", "SimdAddOp", smallSignedTypes, 2,
                       hsubCode)
    threeEqualRegInstX("shsub", "ShsubQX", "SimdAddOp", smallSignedTypes, 4,
                       hsubCode)
    # SLI
    # Snippet: shift left and insert. When imm is at least the element width
    # in bits the destination is left as it is; otherwise the shifted source
    # is ORed with (destElem & mask(imm)) -- presumably the low imm bits of
    # the old destination; confirm mask() semantics. The old destination is
    # an input, hence the positional True (presumably the read-destination
    # flag) in both registrations.
    sliCode = '''
            if (imm >= sizeof(Element) * 8)
                destElem = destElem;
            else
                destElem = (srcElem1 << imm) | (destElem & mask(imm));
    '''
    twoEqualRegInstX("sli", "SliDX", "SimdShiftOp", unsignedTypes, 2, sliCode,
                     True, hasImm=True)
    twoEqualRegInstX("sli", "SliQX", "SimdShiftOp", unsignedTypes, 4, sliCode,
                     True, hasImm=True)
    # SMAX
    # Snippet: the larger of the two sources (srcElem2 when they are equal).
    # maxCode is shared by the SMAX and SMAXP registrations.
    maxCode = "destElem = (srcElem1 > srcElem2) ? srcElem1 : srcElem2;"
    threeEqualRegInstX("smax", "SmaxDX", "SimdCmpOp", smallSignedTypes, 2,
                       maxCode)
    threeEqualRegInstX("smax", "SmaxQX", "SimdCmpOp", smallSignedTypes, 4,
                       maxCode)
    # SMAXP
    # Same snippet, registered with pairwise=True.
    threeEqualRegInstX("smaxp", "SmaxpDX", "SimdCmpOp", smallSignedTypes, 2,
                       maxCode, pairwise=True)
    threeEqualRegInstX("smaxp", "SmaxpQX", "SimdCmpOp", smallSignedTypes, 4,
                       maxCode, pairwise=True)
    # SMAXV
    # Snippet: running maximum. destElem takes the source element on the
    # first iteration (i == 0) or whenever the source is larger. i is not
    # declared in the snippet; it presumably is the element index of the loop
    # generated by twoRegAcrossInstX -- TODO confirm.
    # The D-register form lists only int8_t/int16_t; the Q form passes
    # smallSignedTypes.
    maxAcrossCode = '''
            if (i == 0 || srcElem1 > destElem)
                destElem = srcElem1;
    '''
    twoRegAcrossInstX("smaxv", "SmaxvDX", "SimdCmpOp", ("int8_t", "int16_t"),
                      2, maxAcrossCode)
    twoRegAcrossInstX("smaxv", "SmaxvQX", "SimdCmpOp", smallSignedTypes, 4,
                      maxAcrossCode)
    # SMIN
    # Snippet: the smaller of the two sources (srcElem2 when they are equal).
    # minCode is shared by the SMIN and SMINP registrations.
    minCode = "destElem = (srcElem1 < srcElem2) ? srcElem1 : srcElem2;"
    threeEqualRegInstX("smin", "SminDX", "SimdCmpOp", smallSignedTypes, 2,
                       minCode)
    threeEqualRegInstX("smin", "SminQX", "SimdCmpOp", smallSignedTypes, 4,
                       minCode)
    # SMINP
    # Same snippet, registered with pairwise=True.
    threeEqualRegInstX("sminp", "SminpDX", "SimdCmpOp", smallSignedTypes, 2,
                       minCode, pairwise=True)
    threeEqualRegInstX("sminp", "SminpQX", "SimdCmpOp", smallSignedTypes, 4,
                       minCode, pairwise=True)
    # SMINV
    # Snippet: running minimum, mirroring SMAXV with "<".
    minAcrossCode = '''
            if (i == 0 || srcElem1 < destElem)
                destElem = srcElem1;
    '''
    twoRegAcrossInstX("sminv", "SminvDX", "SimdCmpOp", ("int8_t", "int16_t"),
                      2, minAcrossCode)
    twoRegAcrossInstX("sminv", "SminvQX", "SimdCmpOp", smallSignedTypes, 4,
                      minAcrossCode)

    # split() is defined outside this section; presumably it tells the ISA
    # parser to start a new piece of the generated 'exec' output here --
    # TODO confirm against the parser.
    split('exec')

    # SMLAL, SMLAL2 (by element)
    # Snippet: product of the two sources (each cast to BigElement),
    # accumulated into destElem with "+=". The by-element forms list only
    # int16_t/int32_t; the vector forms pass smallSignedTypes.
    mlalCode = "destElem += (BigElement)srcElem1 * (BigElement)srcElem2;"
    threeRegLongInstX("smlal", "SmlalElemX", "SimdMultAccOp",
                      ("int16_t", "int32_t"), mlalCode, True, byElem=True)
    threeRegLongInstX("smlal", "SmlalElem2X", "SimdMultAccOp",
                      ("int16_t", "int32_t"), mlalCode, True, byElem=True,
                      hi=True)
    # SMLAL, SMLAL2 (vector)
    threeRegLongInstX("smlal", "SmlalX", "SimdMultAccOp", smallSignedTypes,
                      mlalCode, True)
    threeRegLongInstX("smlal", "Smlal2X", "SimdMultAccOp", smallSignedTypes,
                      mlalCode, True, hi=True)
    # SMLSL, SMLSL2 (by element)
    # Snippet: same widened product, subtracted from destElem with "-=".
    # NOTE(review): the by-element forms here pass smallSignedTypes while the
    # SMLAL by-element forms list only int16_t/int32_t -- confirm which type
    # list is intended.
    mlslCode = "destElem -= (BigElement)srcElem1 * (BigElement)srcElem2;"
    threeRegLongInstX("smlsl", "SmlslElemX", "SimdMultAccOp", smallSignedTypes,
                      mlslCode, True, byElem=True)
    threeRegLongInstX("smlsl", "SmlslElem2X", "SimdMultAccOp",
                      smallSignedTypes, mlslCode, True, byElem=True, hi=True)
    # SMLSL, SMLSL2 (vector)
    threeRegLongInstX("smlsl", "SmlslX", "SimdMultAccOp", smallSignedTypes,
                      mlslCode, True)
    threeRegLongInstX("smlsl", "Smlsl2X", "SimdMultAccOp", smallSignedTypes,
                      mlslCode, True, hi=True)
    # SMOV
    # No snippet is passed: insToGprInstX generates the body itself. The
    # 'W' / 'X' argument presumably selects the general-purpose register
    # width and the trailing True presumably requests sign extension --
    # TODO confirm against the helper.
    insToGprInstX("smov", "SmovWX", "SimdMiscOp", ("int8_t", "int16_t"), 4,
                  'W', True)
    insToGprInstX("smov", "SmovXX", "SimdMiscOp", smallSignedTypes, 4, 'X',
                  True)
    # SMULL, SMULL2 (by element)
    # Snippet: non-accumulating product of the two sources cast to
    # BigElement.
    mullCode = "destElem = (BigElement)srcElem1 * (BigElement)srcElem2;"
    threeRegLongInstX("smull", "SmullElemX", "SimdMultOp", smallSignedTypes,
                      mullCode, byElem=True)
    threeRegLongInstX("smull", "SmullElem2X", "SimdMultOp", smallSignedTypes,
                      mullCode, byElem=True, hi=True)
    # SMULL, SMULL2 (vector)
    threeRegLongInstX("smull", "SmullX", "SimdMultOp", smallSignedTypes,
                      mullCode)
    threeRegLongInstX("smull", "Smull2X", "SimdMultOp", smallSignedTypes,
                      mullCode, hi=True)
    # SQABS
    # Snippet: saturating absolute value. The sticky flag is read from
    # FpscrQc into a local FPSCR and written back at the end.
    #  - source equal to the type's minimum: set fpscr.qc and store the
    #    bitwise complement of the source;
    #  - other negative source: store its negation;
    #  - otherwise: copy the source.
    # The D-register form passes smallSignedTypes; the Q and scalar forms
    # pass signedTypes.
    sqabsCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        if (srcElem1 == (Element)(std::numeric_limits<Element>::min())) {
            fpscr.qc = 1;
            destElem = ~srcElem1;
        } else if (srcElem1 < 0) {
            destElem = -srcElem1;
        } else {
            destElem = srcElem1;
        }
        FpscrQc = fpscr;
    '''
    twoEqualRegInstX("sqabs", "SqabsDX", "SimdAluOp", smallSignedTypes, 2,
                     sqabsCode)
    twoEqualRegInstX("sqabs", "SqabsQX", "SimdAluOp", signedTypes, 4,
                     sqabsCode)
    twoEqualRegInstX("sqabs", "SqabsScX", "SimdAluOp", signedTypes, 4,
                     sqabsCode, scalar=True)
    # SQADD
    # Snippet: saturating signed add. The sum is computed first; overflow is
    # detected when both sources have the same sign and the sum's sign
    # differs from it. On overflow destElem is set to the type's minimum,
    # and if the wrapped sum was negative (i.e. the sources were
    # non-negative) 1 is subtracted from that minimum; fpscr.qc is set.
    sqaddCode = '''
            destElem = srcElem1 + srcElem2;
            FPSCR fpscr = (FPSCR) FpscrQc;
            bool negDest = (destElem < 0);
            bool negSrc1 = (srcElem1 < 0);
            bool negSrc2 = (srcElem2 < 0);
            if ((negDest != negSrc1) && (negSrc1 == negSrc2)) {
                destElem = std::numeric_limits<Element>::min();
                if (negDest)
                    destElem -= 1;
                fpscr.qc = 1;
            }
            FpscrQc = fpscr;
    '''
    threeEqualRegInstX("sqadd", "SqaddDX", "SimdAddOp", smallSignedTypes, 2,
                       sqaddCode)
    threeEqualRegInstX("sqadd", "SqaddQX", "SimdAddOp", signedTypes, 4,
                       sqaddCode)
    threeEqualRegInstX("sqadd", "SqaddScX", "SimdAddOp", signedTypes, 4,
                       sqaddCode, scalar=True)
    # SQDMLAL, SQDMLAL2 (by element)
    # Snippet: saturating doubling multiply-accumulate, in two stages.
    #  1. midElem = 2 * src1 * src2, computed in int64_t. For the operand
    #     pairs (min, min), (min/2, min) and (min, min/2) midElem is replaced
    #     by ~((BigElement)min << element bits) and fpscr.qc is set.
    #  2. midElem is added to destElem. If the old destination and midElem
    #     had the same sign (per ltz()) and the result's sign differs,
    #     destElem is set to mask(BigElement bits - 1), complemented when the
    #     old destination was negative, and fpscr.qc is set.
    # The same snippet is used for the by-element, vector and scalar forms.
    qdmlalCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
        Element maxNeg = std::numeric_limits<Element>::min();
        Element halfNeg = maxNeg / 2;
        if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
            (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
            (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
            midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8));
            fpscr.qc = 1;
        }
        bool negPreDest = ltz(destElem);
        destElem += midElem;
        bool negDest = ltz(destElem);
        bool negMid = ltz(midElem);
        if (negPreDest == negMid && negMid != negDest) {
            destElem = mask(sizeof(BigElement) * 8 - 1);
            if (negPreDest)
                destElem = ~destElem;
            fpscr.qc = 1;
        }
        FpscrQc = fpscr;
    '''
    threeRegLongInstX("sqdmlal", "SqdmlalElemX", "SimdMultAccOp",
                      ("int16_t", "int32_t"), qdmlalCode, True, byElem=True)
    threeRegLongInstX("sqdmlal", "SqdmlalElem2X", "SimdMultAccOp",
                      ("int16_t", "int32_t"), qdmlalCode, True, byElem=True,
                      hi=True)
    threeRegLongInstX("sqdmlal", "SqdmlalElemScX", "SimdMultAccOp",
                      ("int16_t", "int32_t"), qdmlalCode, True, byElem=True,
                      scalar=True)
    # SQDMLAL, SQDMLAL2 (vector)
    threeRegLongInstX("sqdmlal", "SqdmlalX", "SimdMultAccOp",
                      ("int16_t", "int32_t"), qdmlalCode, True)
    threeRegLongInstX("sqdmlal", "Sqdmlal2X", "SimdMultAccOp",
                      ("int16_t", "int32_t"), qdmlalCode, True, hi=True)
    threeRegLongInstX("sqdmlal", "SqdmlalScX", "SimdMultAccOp",
                      ("int16_t", "int32_t"), qdmlalCode, True, scalar=True)
    # SQDMLSL, SQDMLSL2 (by element)
    # Snippet: saturating doubling multiply-subtract. Stage 1 (computing and
    # saturating midElem) is identical to the SQDMLAL snippet. In stage 2
    # midElem is subtracted from destElem, and the overflow test uses the
    # sign of the negated midElem (posMid = ltz(-midElem)) in place of
    # midElem's own sign; the saturation value is chosen the same way
    # (mask(BigElement bits - 1), complemented when the old destination was
    # negative).
    qdmlslCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        BigElement midElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
        Element maxNeg = std::numeric_limits<Element>::min();
        Element halfNeg = maxNeg / 2;
        if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
            (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
            (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
            midElem = ~((BigElement)maxNeg << (sizeof(Element) * 8));
            fpscr.qc = 1;
        }
        bool negPreDest = ltz(destElem);
        destElem -= midElem;
        bool negDest = ltz(destElem);
        bool posMid = ltz((BigElement)-midElem);
        if (negPreDest == posMid && posMid != negDest) {
            destElem = mask(sizeof(BigElement) * 8 - 1);
            if (negPreDest)
                destElem = ~destElem;
            fpscr.qc = 1;
        }
        FpscrQc = fpscr;
    '''
    threeRegLongInstX("sqdmlsl", "SqdmlslElemX", "SimdMultAccOp",
                      ("int16_t", "int32_t"), qdmlslCode, True, byElem=True)
    threeRegLongInstX("sqdmlsl", "SqdmlslElem2X", "SimdMultAccOp",
                      ("int16_t", "int32_t"), qdmlslCode, True, byElem=True,
                      hi=True)
    threeRegLongInstX("sqdmlsl", "SqdmlslElemScX", "SimdMultAccOp",
                      ("int16_t", "int32_t"), qdmlslCode, True, byElem=True,
                      scalar=True)
    # SQDMLSL, SQDMLSL2 (vector)
    threeRegLongInstX("sqdmlsl", "SqdmlslX", "SimdMultAccOp",
                      ("int16_t", "int32_t"), qdmlslCode, True)
    threeRegLongInstX("sqdmlsl", "Sqdmlsl2X", "SimdMultAccOp",
                      ("int16_t", "int32_t"), qdmlslCode, True, hi=True)
    threeRegLongInstX("sqdmlsl", "SqdmlslScX", "SimdMultAccOp",
                      ("int16_t", "int32_t"), qdmlslCode, True, scalar=True)
    # SQDMULH (by element)
    # Snippet: saturating doubling multiply returning the high half. The
    # doubled product is computed in int64_t and shifted right by the element
    # width in bits. The only saturating case handled is both sources equal
    # to the value with just the top bit set (1 << (element bits - 1)); then
    # destElem becomes the complement of that value and fpscr.qc is set.
    sqdmulhCode = '''
            FPSCR fpscr = (FPSCR) FpscrQc;
            destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2) >>
                       (sizeof(Element) * 8);
            if (srcElem1 == srcElem2 &&
                    srcElem1 == (Element)((Element)1 <<
                        (sizeof(Element) * 8 - 1))) {
                destElem = ~srcElem1;
                fpscr.qc = 1;
            }
            FpscrQc = fpscr;
    '''
    threeEqualRegInstX("sqdmulh", "SqdmulhElemDX", "SimdMultOp",
                       ("int16_t", "int32_t"), 2, sqdmulhCode, byElem=True)
    threeEqualRegInstX("sqdmulh", "SqdmulhElemQX", "SimdMultOp",
                       ("int16_t", "int32_t"), 4, sqdmulhCode, byElem=True)
    threeEqualRegInstX("sqdmulh", "SqdmulhElemScX", "SimdMultOp",
                       ("int16_t", "int32_t"), 4, sqdmulhCode, byElem=True,
                       scalar=True)
    # SQDMULH (vector)
    threeEqualRegInstX("sqdmulh", "SqdmulhDX", "SimdMultOp",
                       ("int16_t", "int32_t"), 2, sqdmulhCode)
    threeEqualRegInstX("sqdmulh", "SqdmulhQX", "SimdMultOp",
                       ("int16_t", "int32_t"), 4, sqdmulhCode)
    threeEqualRegInstX("sqdmulh", "SqdmulhScX", "SimdMultOp",
                       ("int16_t", "int32_t"), 4, sqdmulhCode, scalar=True)
    # SQDMULL, SQDMULL2 (by element)
    # Snippet: saturating doubling multiply, long result. destElem is
    # assigned 2 * src1 * src2 computed in int64_t. When both sources equal
    # the value with only the top bit set, destElem is replaced by
    # ~((BigElement)src1 << element bits) and fpscr.qc is set.
    # NOTE(review): every registration passes the positional True (presumably
    # the read-destination flag) although the snippet only assigns destElem
    # and never reads it -- confirm whether the flag is needed.
    qdmullCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2);
        if (srcElem1 == srcElem2 &&
                srcElem1 == (Element)((Element)1 <<
                    (Element)(sizeof(Element) * 8 - 1))) {
            destElem = ~((BigElement)srcElem1 << (sizeof(Element) * 8));
            fpscr.qc = 1;
        }
        FpscrQc = fpscr;
    '''
    threeRegLongInstX("sqdmull", "SqdmullElemX", "SimdMultOp",
                      ("int16_t", "int32_t"), qdmullCode, True, byElem=True)
    threeRegLongInstX("sqdmull", "SqdmullElem2X", "SimdMultOp",
                      ("int16_t", "int32_t"), qdmullCode, True, byElem=True,
                      hi=True)
    threeRegLongInstX("sqdmull", "SqdmullElemScX", "SimdMultOp",
                      ("int16_t", "int32_t"), qdmullCode, True, byElem=True,
                      scalar=True)
    # SQDMULL, SQDMULL2 (vector)
    threeRegLongInstX("sqdmull", "SqdmullX", "SimdMultOp",
                      ("int16_t", "int32_t"), qdmullCode, True)
    threeRegLongInstX("sqdmull", "Sqdmull2X", "SimdMultOp",
                      ("int16_t", "int32_t"), qdmullCode, True, hi=True)
    threeRegLongInstX("sqdmull", "SqdmullScX", "SimdMultOp",
                      ("int16_t", "int32_t"), qdmullCode, True, scalar=True)
    # SQNEG
    # Snippet: saturating negate. A source equal to the type's minimum sets
    # fpscr.qc and stores the bitwise complement of the source; any other
    # source is simply negated.
    sqnegCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        if (srcElem1 == (Element)(std::numeric_limits<Element>::min())) {
            fpscr.qc = 1;
            destElem = ~srcElem1;
        } else {
            destElem = -srcElem1;
        }
        FpscrQc = fpscr;
    '''
    twoEqualRegInstX("sqneg", "SqnegDX", "SimdAluOp", smallSignedTypes, 2,
                     sqnegCode)
    twoEqualRegInstX("sqneg", "SqnegQX", "SimdAluOp", signedTypes, 4,
                     sqnegCode)
    twoEqualRegInstX("sqneg", "SqnegScX", "SimdAluOp", signedTypes, 4,
                     sqnegCode, scalar=True)
    # Shared template for SQRDMLAH / SQRDMLSH. The %(code)s placeholder is
    # filled with "+" or "-" below. The snippet computes
    #   (destElem << nbits) <op> (src1 * src2 * 2) + (1 << (nbits - 1))
    # in BigElement, shifts the result right by nbits, and clamps it to the
    # Element range, setting fpscr.qc whenever clamping happens.
    sqrdmCode = '''

          FPSCR fpscr = (FPSCR) FpscrQc;
          int nbits = sizeof(Element)*8;

          auto val_max = std::numeric_limits<Element>::max();
          auto val_min = std::numeric_limits<Element>::min();
          BigElement unsat_value = ((BigElement)destElem << nbits) %(code)s
                ((BigElement)srcElem1 * (BigElement)srcElem2 * 2) +
                ((BigElement)1 << (nbits - 1));
          unsat_value >>= nbits;

          if (unsat_value > val_max) {
              fpscr.qc = 1;
              destElem = val_max;
          } else if (unsat_value < val_min) {
              fpscr.qc = 1;
              destElem = val_min;
          } else {
              destElem = unsat_value;
          }
          FpscrQc = fpscr;
    '''
    code_add = "+"
    sqrdmlahCode = sqrdmCode % {'code': code_add}
    # Extra code passed to every registration below via extra=. It reads the
    # size field (machInst bits 23:22) and ID_AA64ISAR0_EL1, returns an
    # UndefinedInstruction fault when isar0.rdm is clear or the size field is
    # 0 or 3, and typedefs BigElement as __int128_t for the snippet above.
    rdm_check = '''
      int sz = bits(machInst, 23, 22);
      AA64ISAR0 isar0 = xc->tcBase()->readMiscReg( MISCREG_ID_AA64ISAR0_EL1);
      if (!isar0.rdm || sz == 3 || sz == 0)
          return std::make_shared<UndefinedInstruction>(machInst, true);
      typedef __int128_t BigElement;
    '''
    # SQRDMLAH (by element)
    threeEqualRegInstX("sqrdmlah", "SqrdmlahElemDX", "SimdMultOp",
                       ("int16_t", "int32_t"), 2, sqrdmlahCode, byElem=True,
                       readDest=True, extra=rdm_check)
    threeEqualRegInstX("sqrdmlah", "SqrdmlahElemQX", "SimdMultOp",
                       ("int16_t", "int32_t"), 4, sqrdmlahCode, byElem=True,
                       readDest=True, extra=rdm_check)
    threeEqualRegInstX("sqrdmlah", "SqrdmlahElemScX", "SimdMultOp",
                       ("int16_t", "int32_t"), 4, sqrdmlahCode, byElem=True,
                       readDest=True, scalar=True, extra=rdm_check)
    # SQRDMLAH (vector)
    threeEqualRegInstX("sqrdmlah", "SqrdmlahDX", "SimdMultOp",
                       ("int16_t", "int32_t"), 2, sqrdmlahCode,
                       readDest=True, extra=rdm_check)
    threeEqualRegInstX("sqrdmlah", "SqrdmlahQX", "SimdMultOp",
                       ("int16_t", "int32_t"), 4, sqrdmlahCode,
                       readDest=True, extra=rdm_check)
    threeEqualRegInstX("sqrdmlah", "SqrdmlahScX", "SimdMultOp",
                       ("int16_t", "int32_t"), 4, sqrdmlahCode, scalar=True,
                       readDest=True, extra=rdm_check)
    # SQRDMLSH (by element)
    # Same template with "-" substituted, i.e. the doubled product is
    # subtracted from the shifted destination.
    code_sub = "-"
    sqrdmlshCode = sqrdmCode % {'code': code_sub}

    threeEqualRegInstX("sqrdmlsh", "SqrdmlshElemDX", "SimdMultOp",
                       ("int16_t", "int32_t"), 2, sqrdmlshCode, byElem=True,
                       readDest=True, extra=rdm_check)
    threeEqualRegInstX("sqrdmlsh", "SqrdmlshElemQX", "SimdMultOp",
                       ("int16_t", "int32_t"), 4, sqrdmlshCode, byElem=True,
                       readDest=True, extra=rdm_check)
    threeEqualRegInstX("sqrdmlsh", "SqrdmlshElemScX", "SimdMultOp",
                       ("int16_t", "int32_t"), 4, sqrdmlshCode, byElem=True,
                       readDest=True, scalar=True, extra=rdm_check)
    # SQRDMLSH (vector)
    threeEqualRegInstX("sqrdmlsh", "SqrdmlshDX", "SimdMultOp",
                       ("int16_t", "int32_t"), 2, sqrdmlshCode,
                       readDest=True, extra=rdm_check)
    threeEqualRegInstX("sqrdmlsh", "SqrdmlshQX", "SimdMultOp",
                       ("int16_t", "int32_t"), 4, sqrdmlshCode,
                       readDest=True, extra=rdm_check)
    threeEqualRegInstX("sqrdmlsh", "SqrdmlshScX", "SimdMultOp",
                       ("int16_t", "int32_t"), 4, sqrdmlshCode, scalar=True,
                       readDest=True, extra=rdm_check)
    # SQRDMULH (by element)
    # Snippet for SQRDMULH: saturating rounding doubling multiply returning
    # the high half. The doubled product plus the rounding constant
    # 1 << (element bits - 1) is computed in int64_t and shifted right by the
    # element width in bits. For the operand pairs (min, min), (min/2, min)
    # and (min, min/2) the result is then overridden: a negative computed
    # destElem becomes mask(element bits - 1), a non-negative one becomes the
    # type's minimum, and fpscr.qc is set.
    # NOTE(review): the override assigns the positive limit when the computed
    # value is negative and the minimum otherwise -- presumably correcting a
    # wrapped result; verify against the architectural pseudocode.
    sqrdmulhCode = '''
            FPSCR fpscr = (FPSCR) FpscrQc;
            destElem = (2 * (int64_t)srcElem1 * (int64_t)srcElem2 +
                        ((int64_t)1 << (sizeof(Element) * 8 - 1))) >>
                       (sizeof(Element) * 8);
            Element maxNeg = std::numeric_limits<Element>::min();
            Element halfNeg = maxNeg / 2;
            if ((srcElem1 == maxNeg && srcElem2 == maxNeg) ||
                (srcElem1 == halfNeg && srcElem2 == maxNeg) ||
                (srcElem1 == maxNeg && srcElem2 == halfNeg)) {
                if (destElem < 0) {
                    destElem = mask(sizeof(Element) * 8 - 1);
                } else {
                    destElem = std::numeric_limits<Element>::min();
                }
                fpscr.qc = 1;
            }
            FpscrQc = fpscr;
    '''
    threeEqualRegInstX("sqrdmulh", "SqrdmulhElemDX", "SimdMultOp",
                       ("int16_t", "int32_t"), 2, sqrdmulhCode, byElem=True)
    threeEqualRegInstX("sqrdmulh", "SqrdmulhElemQX", "SimdMultOp",
                       ("int16_t", "int32_t"), 4, sqrdmulhCode, byElem=True)
    threeEqualRegInstX("sqrdmulh", "SqrdmulhElemScX", "SimdMultOp",
                       ("int16_t", "int32_t"), 4, sqrdmulhCode, byElem=True,
                       scalar=True)
    # SQRDMULH (vector)
    threeEqualRegInstX("sqrdmulh", "SqrdmulhDX", "SimdMultOp",
                       ("int16_t", "int32_t"), 2, sqrdmulhCode)
    threeEqualRegInstX("sqrdmulh", "SqrdmulhQX", "SimdMultOp",
                       ("int16_t", "int32_t"), 4, sqrdmulhCode)
    threeEqualRegInstX("sqrdmulh", "SqrdmulhScX", "SimdMultOp",
                       ("int16_t", "int32_t"), 4, sqrdmulhCode, scalar=True)
    # SQRSHL
    # Snippet: saturating rounding shift by a per-element signed amount taken
    # from the low byte of srcElem2 ((int8_t) cast).
    #  - Negative amount: rounding right shift by its magnitude. rBit is the
    #    last bit shifted out when the magnitude is within the element width,
    #    or 1 when the magnitude exceeds the width and the source is
    #    negative. A magnitude of at least the element width yields 0 before
    #    the sign fix-up; the fix-up ORs in the upper bits when the source is
    #    negative but the shifted value is not. rBit is added last.
    #  - Positive amount: left shift with saturation. It saturates when the
    #    amount is at least the element width and the source is non-zero, or
    #    when the top (amount + 1) bits of the source are not all equal to
    #    the sign. Saturation stores mask(element bits - 1), complemented for
    #    a negative source, and sets fpscr.qc.
    #  - Zero amount: the source is copied.
    # NOTE(review): these registrations use the SimdCmpOp class although the
    # operation is a shift -- confirm the op class is intended.
    sqrshlCode = '''
            int16_t shiftAmt = (int8_t)srcElem2;
            FPSCR fpscr = (FPSCR) FpscrQc;
            if (shiftAmt < 0) {
                shiftAmt = -shiftAmt;
                Element rBit = 0;
                if (shiftAmt <= sizeof(Element) * 8)
                    rBit = bits(srcElem1, shiftAmt - 1);
                if (shiftAmt > sizeof(Element) * 8 && srcElem1 < 0)
                    rBit = 1;
                if (shiftAmt >= sizeof(Element) * 8) {
                    shiftAmt = sizeof(Element) * 8 - 1;
                    destElem = 0;
                } else {
                    destElem = (srcElem1 >> shiftAmt);
                }
                // Make sure the right shift sign extended when it should.
                if (srcElem1 < 0 && destElem >= 0) {
                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
                                                 1 - shiftAmt));
                }
                destElem += rBit;
            } else if (shiftAmt > 0) {
                bool sat = false;
                if (shiftAmt >= sizeof(Element) * 8) {
                    if (srcElem1 != 0)
                        sat = true;
                    else
                        destElem = 0;
                } else {
                    if (bits((uint64_t) srcElem1, sizeof(Element) * 8 - 1,
                                sizeof(Element) * 8 - 1 - shiftAmt) !=
                            ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) {
                        sat = true;
                    } else {
                        destElem = srcElem1 << shiftAmt;
                    }
                }
                if (sat) {
                    fpscr.qc = 1;
                    destElem = mask(sizeof(Element) * 8 - 1);
                    if (srcElem1 < 0)
                        destElem = ~destElem;
                }
            } else {
                destElem = srcElem1;
            }
            FpscrQc = fpscr;
    '''
    threeEqualRegInstX("sqrshl", "SqrshlDX", "SimdCmpOp", smallSignedTypes, 2,
                       sqrshlCode)
    threeEqualRegInstX("sqrshl", "SqrshlQX", "SimdCmpOp", signedTypes, 4,
                       sqrshlCode)
    threeEqualRegInstX("sqrshl", "SqrshlScX", "SimdCmpOp", signedTypes, 4,
                       sqrshlCode, scalar=True)
    # SQRSHRN, SQRSHRN2
    # sqrshrnCode is the per-element C++ snippet shared by the three
    # registrations below. It shifts srcElem1 right by imm with rounding
    # (rBit, the last bit shifted out, is added back) and stores the result
    # in destElem. If the rounded value does not survive a cast to Element,
    # destElem becomes mask(bits - 1), or its complement for a negative
    # srcElem1, and fpscr.qc is set. The imm == 0 branch applies the same
    # clamp to the unshifted value. When imm exceeds the source width,
    # destElem is 0 and qc is set unless srcElem1 is 0 or -1.
    sqrshrnCode = '''
            FPSCR fpscr = (FPSCR) FpscrQc;
            if (imm > sizeof(srcElem1) * 8) {
                if (srcElem1 != 0 && srcElem1 != -1)
                    fpscr.qc = 1;
                destElem = 0;
            } else if (imm) {
                BigElement mid = (srcElem1 >> (imm - 1));
                uint64_t rBit = mid & 0x1;
                mid >>= 1;
                mid |= -(mid & ((BigElement)1 <<
                            (sizeof(BigElement) * 8 - 1 - imm)));
                mid += rBit;
                if (mid != (Element)mid) {
                    destElem = mask(sizeof(Element) * 8 - 1);
                    if (srcElem1 < 0)
                        destElem = ~destElem;
                    fpscr.qc = 1;
                } else {
                    destElem = mid;
                }
            } else {
                if (srcElem1 != (Element)srcElem1) {
                    destElem = mask(sizeof(Element) * 8 - 1);
                    if (srcElem1 < 0)
                        destElem = ~destElem;
                    fpscr.qc = 1;
                } else {
                    destElem = srcElem1;
                }
            }
            FpscrQc = fpscr;
    '''
    # Plain, hi=True ("sqrshrn2") and scalar registrations, all over
    # smallSignedTypes with hasImm=True.
    twoRegNarrowInstX("sqrshrn", "SqrshrnX", "SimdShiftOp", smallSignedTypes,
                      sqrshrnCode, hasImm=True)
    twoRegNarrowInstX("sqrshrn2", "Sqrshrn2X", "SimdShiftOp", smallSignedTypes,
                      sqrshrnCode, hasImm=True, hi=True)
    twoRegNarrowInstX("sqrshrn", "SqrshrnScX", "SimdShiftOp", smallSignedTypes,
                      sqrshrnCode, hasImm=True, scalar=True)
    # SQRSHRUN, SQRSHRUN2
    # sqrshrunCode is the per-element C++ snippet shared by the three
    # registrations below. It shifts the signed srcElem1 right by imm with
    # rounding (rBit, the last bit shifted out, is added back). If any bit
    # of the rounded value at or above position sizeof(Element) * 8 is set,
    # the result is clamped: 0 for a negative srcElem1, otherwise
    # mask(sizeof(Element) * 8), and fpscr.qc is set. With imm == 0 a
    # negative srcElem1 yields 0 with qc set. When imm exceeds the source
    # width, destElem is 0 and qc is set for any non-zero srcElem1.
    sqrshrunCode = '''
            FPSCR fpscr = (FPSCR) FpscrQc;
            if (imm > sizeof(srcElem1) * 8) {
                if (srcElem1 != 0)
                    fpscr.qc = 1;
                destElem = 0;
            } else if (imm) {
                BigElement mid = (srcElem1 >> (imm - 1));
                uint64_t rBit = mid & 0x1;
                mid >>= 1;
                mid |= -(mid & ((BigElement)1 <<
                                (sizeof(BigElement) * 8 - 1 - imm)));
                mid += rBit;
                if (bits(mid, sizeof(BigElement) * 8 - 1,
                              sizeof(Element) * 8) != 0) {
                    if (srcElem1 < 0) {
                        destElem = 0;
                    } else {
                        destElem = mask(sizeof(Element) * 8);
                    }
                    fpscr.qc = 1;
                } else {
                    destElem = mid;
                }
            } else {
                if (srcElem1 < 0) {
                    fpscr.qc = 1;
                    destElem = 0;
                } else {
                    destElem = srcElem1;
                }
            }
            FpscrQc = fpscr;
    '''
    twoRegNarrowInstX("sqrshrun", "SqrshrunX", "SimdShiftOp", smallSignedTypes,
                      sqrshrunCode, hasImm=True)
    # The hi=True variant is the SQRSHRUN2 form named in the section header;
    # give it the "2" mnemonic, as the sibling Sqrshrn2X registration does.
    twoRegNarrowInstX("sqrshrun2", "Sqrshrun2X", "SimdShiftOp",
                      smallSignedTypes, sqrshrunCode, hasImm=True, hi=True)
    twoRegNarrowInstX("sqrshrun", "SqrshrunScX", "SimdShiftOp",
                      smallSignedTypes, sqrshrunCode, hasImm=True, scalar=True)
    # SQSHL (immediate)
    # sqshlImmCode is the per-element C++ snippet for a left shift of
    # srcElem1 by imm with signed saturation. topBits holds the top imm + 1
    # bits of srcElem1; unless they are all zeros or all ones the shift
    # would change the sign, so destElem is set to
    # std::numeric_limits<Element>::min(), or its complement for a positive
    # srcElem1, and fpscr.qc is set. When imm is at least the element
    # width, any non-zero srcElem1 saturates the same way. imm == 0 copies
    # the source unchanged.
    sqshlImmCode = '''
            FPSCR fpscr = (FPSCR) FpscrQc;
            if (imm >= sizeof(Element) * 8) {
                if (srcElem1 != 0) {
                    destElem = std::numeric_limits<Element>::min();
                    if (srcElem1 > 0)
                        destElem = ~destElem;
                    fpscr.qc = 1;
                } else {
                    destElem = 0;
                }
            } else if (imm) {
                destElem = (srcElem1 << imm);
                uint64_t topBits = bits((uint64_t)srcElem1,
                                        sizeof(Element) * 8 - 1,
                                        sizeof(Element) * 8 - 1 - imm);
                if (topBits != 0 && topBits != mask(imm + 1)) {
                    destElem = std::numeric_limits<Element>::min();
                    if (srcElem1 > 0)
                        destElem = ~destElem;
                    fpscr.qc = 1;
                }
            } else {
                destElem = srcElem1;
            }
            FpscrQc = fpscr;
    '''
    # DX, QX and scalar registrations, all with hasImm=True.
    twoEqualRegInstX("sqshl", "SqshlImmDX", "SimdAluOp", smallSignedTypes, 2,
                     sqshlImmCode, hasImm=True)
    twoEqualRegInstX("sqshl", "SqshlImmQX", "SimdAluOp", signedTypes, 4,
                     sqshlImmCode, hasImm=True)
    twoEqualRegInstX("sqshl", "SqshlImmScX", "SimdAluOp", signedTypes, 4,
                     sqshlImmCode, hasImm=True, scalar=True)
    # SQSHL (register)
    # sqshlCode is the per-element C++ snippet for a register-controlled
    # shift. The shift amount is the low byte of srcElem2 read as a signed
    # value:
    #   * negative: shift right by -shiftAmt (an amount of at least the
    #     element width is replaced by width - 1 with a zero result), then
    #     OR in the sign bits by hand if srcElem1 was negative but the
    #     result is not;
    #   * positive: shift left, saturating to mask(bits - 1) or its
    #     complement (for a negative source) and setting fpscr.qc when the
    #     top shiftAmt + 1 bits are not a pure sign extension, or when the
    #     amount is at least the element width and srcElem1 is non-zero;
    #   * zero: copy srcElem1.
    sqshlCode = '''
            int16_t shiftAmt = (int8_t)srcElem2;
            FPSCR fpscr = (FPSCR) FpscrQc;
            if (shiftAmt < 0) {
                shiftAmt = -shiftAmt;
                if (shiftAmt >= sizeof(Element) * 8) {
                    shiftAmt = sizeof(Element) * 8 - 1;
                    destElem = 0;
                } else {
                    destElem = (srcElem1 >> shiftAmt);
                }
                // Make sure the right shift sign extended when it should.
                if (srcElem1 < 0 && destElem >= 0) {
                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
                                                 1 - shiftAmt));
                }
            } else if (shiftAmt > 0) {
                bool sat = false;
                if (shiftAmt >= sizeof(Element) * 8) {
                    if (srcElem1 != 0)
                        sat = true;
                    else
                        destElem = 0;
                } else {
                    if (bits((uint64_t) srcElem1, sizeof(Element) * 8 - 1,
                                sizeof(Element) * 8 - 1 - shiftAmt) !=
                            ((srcElem1 < 0) ? mask(shiftAmt + 1) : 0)) {
                        sat = true;
                    } else {
                        destElem = srcElem1 << shiftAmt;
                    }
                }
                if (sat) {
                    fpscr.qc = 1;
                    destElem = mask(sizeof(Element) * 8 - 1);
                    if (srcElem1 < 0)
                        destElem = ~destElem;
                }
            } else {
                destElem = srcElem1;
            }
            FpscrQc = fpscr;
    '''
    # DX, QX and scalar registrations.
    threeEqualRegInstX("sqshl", "SqshlDX", "SimdAluOp", smallSignedTypes, 2,
                       sqshlCode)
    threeEqualRegInstX("sqshl", "SqshlQX", "SimdAluOp", signedTypes, 4,
                       sqshlCode)
    threeEqualRegInstX("sqshl", "SqshlScX", "SimdAluOp", signedTypes, 4,
                       sqshlCode, scalar=True)
    # SQSHLU
    # sqshluCode is the per-element C++ snippet for a left shift of a
    # signed source by imm with saturation to an unsigned range. In every
    # branch a negative srcElem1 gives destElem = 0 with fpscr.qc set. A
    # non-negative srcElem1 whose top imm bits are non-zero (or any
    # positive srcElem1 when imm is at least the element width) gives
    # mask(sizeof(Element) * 8) with qc set. Otherwise the shifted value
    # is kept.
    sqshluCode = '''
            FPSCR fpscr = (FPSCR) FpscrQc;
            if (imm >= sizeof(Element) * 8) {
                if (srcElem1 < 0) {
                    destElem = 0;
                    fpscr.qc = 1;
                } else if (srcElem1 > 0) {
                    destElem = mask(sizeof(Element) * 8);
                    fpscr.qc = 1;
                } else {
                    destElem = 0;
                }
            } else if (imm) {
                destElem = (srcElem1 << imm);
                uint64_t topBits = bits((uint64_t)srcElem1,
                                        sizeof(Element) * 8 - 1,
                                        sizeof(Element) * 8 - imm);
                if (srcElem1 < 0) {
                    destElem = 0;
                    fpscr.qc = 1;
                } else if (topBits != 0) {
                    destElem = mask(sizeof(Element) * 8);
                    fpscr.qc = 1;
                }
            } else {
                if (srcElem1 < 0) {
                    fpscr.qc = 1;
                    destElem = 0;
                } else {
                    destElem = srcElem1;
                }
            }
            FpscrQc = fpscr;
    '''
    # DX, QX and scalar registrations, all with hasImm=True.
    twoEqualRegInstX("sqshlu", "SqshluDX", "SimdAluOp", smallSignedTypes, 2,
                     sqshluCode, hasImm=True)
    twoEqualRegInstX("sqshlu", "SqshluQX", "SimdAluOp", signedTypes, 4,
                     sqshluCode, hasImm=True)
    twoEqualRegInstX("sqshlu", "SqshluScX", "SimdAluOp", signedTypes, 4,
                     sqshluCode, hasImm=True, scalar=True)
    # SQSHRN, SQSHRN2
    # sqshrnCode is the per-element C++ snippet for a truncating (no
    # rounding bit) right shift of srcElem1 by imm. The shift is done as
    # (imm - 1) followed by 1, then the sign bits are ORed back in by
    # hand. If the shifted value does not survive a cast to Element,
    # destElem becomes mask(bits - 1), or its complement for a negative
    # srcElem1, and fpscr.qc is set. The imm == 0 branch copies srcElem1
    # without any range check. When imm exceeds the source width, destElem
    # is 0 and qc is set unless srcElem1 is 0 or -1.
    sqshrnCode = '''
        FPSCR fpscr = (FPSCR) FpscrQc;
        if (imm > sizeof(srcElem1) * 8) {
            if (srcElem1 != 0 && srcElem1 != -1)
                fpscr.qc = 1;
            destElem = 0;
        } else if (imm) {
            BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
            mid |= -(mid & ((BigElement)1 <<
                        (sizeof(BigElement) * 8 - 1 - imm)));
            if (mid != (Element)mid) {
                destElem = mask(sizeof(Element) * 8 - 1);
                if (srcElem1 < 0)
                    destElem = ~destElem;
                fpscr.qc = 1;
            } else {
                destElem = mid;
            }
        } else {
            destElem = srcElem1;
        }
        FpscrQc = fpscr;
    '''
    # Plain, hi=True ("sqshrn2") and scalar registrations, all over
    # smallSignedTypes with hasImm=True.
    twoRegNarrowInstX("sqshrn", "SqshrnX", "SimdShiftOp", smallSignedTypes,
                      sqshrnCode, hasImm=True)
    twoRegNarrowInstX("sqshrn2", "Sqshrn2X", "SimdShiftOp", smallSignedTypes,
                      sqshrnCode, hasImm=True, hi=True)
    twoRegNarrowInstX("sqshrn", "SqshrnScX", "SimdShiftOp", smallSignedTypes,
                      sqshrnCode, hasImm=True, scalar=True)
    # SQSHRUN, SQSHRUN2
    # sqshrunCode is the per-element C++ snippet for a truncating right
    # shift of the signed srcElem1 by imm. If any bit of the shifted value
    # at or above position sizeof(Element) * 8 is set, the result is
    # clamped: 0 for a negative srcElem1, otherwise
    # mask(sizeof(Element) * 8), and fpscr.qc is set. The imm == 0 branch
    # copies srcElem1 without any range check. When imm exceeds the source
    # width, destElem is 0 and qc is set for any non-zero srcElem1.
    sqshrunCode = '''
            FPSCR fpscr = (FPSCR) FpscrQc;
            if (imm > sizeof(srcElem1) * 8) {
                if (srcElem1 != 0)
                    fpscr.qc = 1;
                destElem = 0;
            } else if (imm) {
                BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
                if (bits(mid, sizeof(BigElement) * 8 - 1,
                              sizeof(Element) * 8) != 0) {
                    if (srcElem1 < 0) {
                        destElem = 0;
                    } else {
                        destElem = mask(sizeof(Element) * 8);
                    }
                    fpscr.qc = 1;
                } else {
                    destElem = mid;
                }
            } else {
                destElem = srcElem1;
            }
            FpscrQc = fpscr;
    '''
    twoRegNarrowInstX("sqshrun", "SqshrunX", "SimdShiftOp", smallSignedTypes,
                      sqshrunCode, hasImm=True)
    # The hi=True variant is the SQSHRUN2 form named in the section header;
    # give it the "2" mnemonic, as the sibling Sqshrn2X registration does.
    twoRegNarrowInstX("sqshrun2", "Sqshrun2X", "SimdShiftOp", smallSignedTypes,
                      sqshrunCode, hasImm=True, hi=True)
    twoRegNarrowInstX("sqshrun", "SqshrunScX", "SimdShiftOp", smallSignedTypes,
                      sqshrunCode, hasImm=True, scalar=True)
    # SQSUB
    # sqsubCode is the per-element C++ snippet for a signed subtraction
    # with saturation. Overflow is detected from signs only: the result's
    # sign differs from srcElem1's, and srcElem1 and srcElem2 have
    # different signs (negSrc1 == posSrc2). On overflow destElem starts at
    # std::numeric_limits<Element>::min(), has 1 subtracted when the
    # wrapped difference was negative (presumably relying on wrap-around
    # to reach the maximum value), and fpscr.qc is set.
    sqsubCode = '''
            destElem = srcElem1 - srcElem2;
            FPSCR fpscr = (FPSCR) FpscrQc;
            bool negDest = (destElem < 0);
            bool negSrc1 = (srcElem1 < 0);
            bool posSrc2 = (srcElem2 >= 0);
            if ((negDest != negSrc1) && (negSrc1 == posSrc2)) {
                destElem = std::numeric_limits<Element>::min();
                if (negDest)
                    destElem -= 1;
                fpscr.qc = 1;
            }
            FpscrQc = fpscr;
    '''
    # DX, QX and scalar registrations.
    threeEqualRegInstX("sqsub", "SqsubDX", "SimdAddOp", smallSignedTypes, 2,
                       sqsubCode)
    threeEqualRegInstX("sqsub", "SqsubQX", "SimdAddOp", signedTypes, 4,
                       sqsubCode)
    threeEqualRegInstX("sqsub", "SqsubScX", "SimdAddOp", signedTypes, 4,
                       sqsubCode, scalar=True)
    # SQXTN, SQXTN2
    # sqxtnCode is the per-element C++ snippet that assigns the wide
    # srcElem1 to the narrower destElem. If widening destElem back to
    # BigElement does not reproduce srcElem1, the value did not fit:
    # fpscr.qc is set and destElem becomes mask(bits - 1), or its
    # complement for a negative srcElem1.
    sqxtnCode = '''
            FPSCR fpscr = (FPSCR) FpscrQc;
            destElem = srcElem1;
            if ((BigElement)destElem != srcElem1) {
                fpscr.qc = 1;
                destElem = mask(sizeof(Element) * 8 - 1);
                if (srcElem1 < 0)
                    destElem = ~destElem;
            }
            FpscrQc = fpscr;
    '''
    twoRegNarrowInstX("sqxtn", "SqxtnX", "SimdMiscOp", smallSignedTypes,
                      sqxtnCode)
    # The hi=True variant is the SQXTN2 form named in the section header,
    # so it gets the "2" mnemonic.
    twoRegNarrowInstX("sqxtn2", "Sqxtn2X", "SimdMiscOp", smallSignedTypes,
                      sqxtnCode, hi=True)
    twoRegNarrowInstX("sqxtn", "SqxtnScX", "SimdMiscOp", smallSignedTypes,
                      sqxtnCode, scalar=True)
    # SQXTUN, SQXTUN2
    # sqxtunCode is the per-element C++ snippet that assigns the wide
    # signed srcElem1 to the narrower destElem. The value is out of range
    # when srcElem1 is negative or when the low sizeof(Element) * 8 bits of
    # destElem do not reproduce srcElem1. In that case fpscr.qc is set and
    # destElem becomes mask(sizeof(Element) * 8), or its complement for a
    # negative srcElem1.
    sqxtunCode = '''
            FPSCR fpscr = (FPSCR) FpscrQc;
            destElem = srcElem1;
            if (srcElem1 < 0 ||
                    ((BigElement)destElem & mask(sizeof(Element) * 8))
                     != srcElem1) {
                fpscr.qc = 1;
                destElem = mask(sizeof(Element) * 8);
                if (srcElem1 < 0)
                    destElem = ~destElem;
            }
            FpscrQc = fpscr;
    '''
    twoRegNarrowInstX("sqxtun", "SqxtunX", "SimdMiscOp", smallSignedTypes,
                      sqxtunCode)
    # The hi=True variant is the SQXTUN2 form named in the section header,
    # so it gets the "2" mnemonic.
    twoRegNarrowInstX("sqxtun2", "Sqxtun2X", "SimdMiscOp", smallSignedTypes,
                      sqxtunCode, hi=True)
    twoRegNarrowInstX("sqxtun", "SqxtunScX", "SimdMiscOp", smallSignedTypes,
                      sqxtunCode, scalar=True)
    # SRHADD
    # rhaddCode is the per-element C++ snippet for a rounding halving add.
    # Each operand has its low bit cleared and is halved separately, so
    # the full sum is never formed. carryBit is
    # (lsb(srcElem1) + lsb(srcElem2) + 1) >> 1, which restores the
    # contribution of the two dropped low bits plus the rounding
    # increment.
    rhaddCode = '''
            Element carryBit =
                (((unsigned)srcElem1 & 0x1) +
                 ((unsigned)srcElem2 & 0x1) + 1) >> 1;
            // Use division instead of a shift to ensure the sign extension
            // works right. The compiler will figure out if it can be a shift.
            // Mask the inputs so they get truncated correctly.
            destElem = (((srcElem1 & ~(Element)1) / 2) +
                        ((srcElem2 & ~(Element)1) / 2)) + carryBit;
    '''
    # DX and QX registrations, both over smallSignedTypes.
    threeEqualRegInstX("srhadd", "SrhaddDX", "SimdAddOp", smallSignedTypes, 2,
                       rhaddCode)
    threeEqualRegInstX("srhadd", "SrhaddQX", "SimdAddOp", smallSignedTypes, 4,
                       rhaddCode)
    # SRI
    # sriCode is the per-element C++ snippet for shift-right-and-insert:
    # srcElem1 shifted right by imm is ORed with the bits of destElem that
    # lie outside mask(sizeof(Element) * 8 - imm), so those bits of the old
    # destination are preserved. When imm is at least the element width,
    # destElem is left as it was (the self-assignment is a no-op).
    sriCode = '''
            if (imm >= sizeof(Element) * 8)
                destElem = destElem;
            else
                destElem = (srcElem1 >> imm) |
                    (destElem & ~mask(sizeof(Element) * 8 - imm));
    '''
    # DX and QX registrations over unsignedTypes. The positional True
    # after sriCode is presumably the helper's read-destination flag, since
    # the snippet reads destElem; confirm against twoEqualRegInstX.
    twoEqualRegInstX("sri", "SriDX", "SimdShiftOp", unsignedTypes, 2, sriCode,
                     True, hasImm=True)
    twoEqualRegInstX("sri", "SriQX", "SimdShiftOp", unsignedTypes, 4, sriCode,
                     True, hasImm=True)
    # SRSHL
    # rshlCode is the per-element C++ snippet for a register-controlled
    # rounding shift without saturation. The shift amount is the low byte
    # of srcElem2 read as a signed value:
    #   * negative: shift right by -shiftAmt, add the rounding bit rBit
    #     (the last bit shifted out, or 1 for a negative source when the
    #     amount exceeds the element width), and OR in sign bits by hand
    #     when the source is negative but the shifted result is not;
    #   * positive: shift left, giving 0 when the amount is at least the
    #     element width;
    #   * zero: copy srcElem1.
    # Sign tests use ltz() rather than "< 0".
    rshlCode = '''
            int16_t shiftAmt = (int8_t)srcElem2;
            if (shiftAmt < 0) {
                shiftAmt = -shiftAmt;
                Element rBit = 0;
                if (shiftAmt <= sizeof(Element) * 8)
                    rBit = bits(srcElem1, shiftAmt - 1);
                if (shiftAmt > sizeof(Element) * 8 && ltz(srcElem1))
                    rBit = 1;
                if (shiftAmt >= sizeof(Element) * 8) {
                    shiftAmt = sizeof(Element) * 8 - 1;
                    destElem = 0;
                } else {
                    destElem = (srcElem1 >> shiftAmt);
                }
                // Make sure the right shift sign extended when it should.
                if (ltz(srcElem1) && !ltz(destElem)) {
                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
                                                 1 - shiftAmt));
                }
                destElem += rBit;
            } else if (shiftAmt > 0) {
                if (shiftAmt >= sizeof(Element) * 8) {
                    destElem = 0;
                } else {
                    destElem = srcElem1 << shiftAmt;
                }
            } else {
                destElem = srcElem1;
            }
    '''
    # DX and QX registrations over signedTypes.
    threeEqualRegInstX("srshl", "SrshlDX", "SimdShiftOp", signedTypes, 2,
                       rshlCode)
    threeEqualRegInstX("srshl", "SrshlQX", "SimdShiftOp", signedTypes, 4,
                       rshlCode)
    # SRSHR
    # rshrCode is the per-element C++ snippet for a rounding right shift
    # by imm. rBit is bit imm - 1 of srcElem1, i.e. the last bit shifted
    # out, and is added to the shifted value. The shift is done as
    # (imm - 1) followed by 1. An imm larger than the source width gives
    # 0; imm == 0 copies srcElem1.
    rshrCode = '''
            if (imm > sizeof(srcElem1) * 8) {
                destElem = 0;
            } else if (imm) {
                Element rBit = bits(srcElem1, imm - 1);
                destElem = ((srcElem1 >> (imm - 1)) >> 1) + rBit;
            } else {
                destElem = srcElem1;
            }
    '''
    # DX and QX registrations over signedTypes, with hasImm=True.
    twoEqualRegInstX("srshr", "SrshrDX", "SimdShiftOp", signedTypes, 2,
                     rshrCode, hasImm=True)
    twoEqualRegInstX("srshr", "SrshrQX", "SimdShiftOp", signedTypes, 4,
                     rshrCode, hasImm=True)
    # SRSRA
    # rsraCode is the per-element C++ snippet for a rounding right shift
    # by imm that is accumulated into destElem. The shifted value is
    # computed as ((srcElem1 >> (imm - 1)) >> 1) plus rBit, where rBit is
    # bit imm - 1 of srcElem1. An imm larger than the source width adds
    # 0; imm == 0 adds srcElem1 itself.
    rsraCode = '''
            if (imm > sizeof(srcElem1) * 8) {
                destElem += 0;
            } else if (imm) {
                Element rBit = bits(srcElem1, imm - 1);
                destElem += ((srcElem1 >> (imm - 1)) >> 1) + rBit;
            } else {
                destElem += srcElem1;
            }
    '''
    # DX and QX registrations over signedTypes. The positional True after
    # rsraCode is presumably the helper's read-destination flag, since the
    # snippet accumulates into destElem; confirm against twoEqualRegInstX.
    twoEqualRegInstX("srsra", "SrsraDX", "SimdShiftOp", signedTypes, 2,
                     rsraCode, True, hasImm=True)
    twoEqualRegInstX("srsra", "SrsraQX", "SimdShiftOp", signedTypes, 4,
                     rsraCode, True, hasImm=True)
    # SSHL
    # shlCode is the per-element C++ snippet for a register-controlled
    # shift with neither rounding nor saturation. The shift amount is the
    # low byte of srcElem2 read as a signed value. A negative amount
    # shifts right by -shiftAmt and ORs in sign bits by hand when the
    # source is negative but the result is not. A non-negative amount
    # shifts left, giving 0 once it reaches the element width.
    shlCode = '''
            int16_t shiftAmt = (int8_t)srcElem2;
            if (shiftAmt < 0) {
                shiftAmt = -shiftAmt;
                if (shiftAmt >= sizeof(Element) * 8) {
                    shiftAmt = sizeof(Element) * 8 - 1;
                    destElem = 0;
                } else {
                    destElem = (srcElem1 >> shiftAmt);
                }
                // Make sure the right shift sign extended when it should.
                if (ltz(srcElem1) && !ltz(destElem)) {
                    destElem |= -((Element)1 << (sizeof(Element) * 8 -
                                                 1 - shiftAmt));
                }
            } else {
                if (shiftAmt >= sizeof(Element) * 8) {
                    destElem = 0;
                } else {
                    destElem = srcElem1 << shiftAmt;
                }
            }
    '''
    # DX and QX registrations over signedTypes.
    threeEqualRegInstX("sshl", "SshlDX", "SimdShiftOp", signedTypes, 2,
                       shlCode)
    threeEqualRegInstX("sshl", "SshlQX", "SimdShiftOp", signedTypes, 4,
                       shlCode)
    # SSHLL, SSHLL2
    # shllCode is the per-element C++ snippet for a widening left shift:
    # srcElem1 is cast to BigElement before being shifted by imm, so the
    # shift happens at the destination width. An imm of at least the
    # destination width gives 0.
    shllCode = '''
            if (imm >= sizeof(destElem) * 8) {
                destElem = 0;
            } else {
                destElem = (BigElement)srcElem1 << imm;
            }
    '''
    twoRegLongInstX("sshll", "SshllX", "SimdShiftOp", smallSignedTypes,
                    shllCode, hasImm=True)
    # The hi=True variant is the SSHLL2 form named in the section header,
    # so it gets the "2" mnemonic.
    twoRegLongInstX("sshll2", "Sshll2X", "SimdShiftOp", smallSignedTypes,
                    shllCode, hasImm=True, hi=True)
    # SSHR
    # shrCode is the per-element C++ snippet for a right shift by imm. An
    # imm of at least the source width is handled explicitly: -1 for a
    # source that ltz() reports as negative, 0 otherwise. Smaller amounts
    # use a plain >>.
    shrCode = '''
            if (imm >= sizeof(srcElem1) * 8) {
                if (ltz(srcElem1))
                    destElem = -1;
                else
                    destElem = 0;
            } else {
                destElem = srcElem1 >> imm;
            }
    '''
    # DX and QX registrations over signedTypes, with hasImm=True.
    twoEqualRegInstX("sshr", "SshrDX", "SimdShiftOp", signedTypes, 2, shrCode,
                     hasImm=True)
    twoEqualRegInstX("sshr", "SshrQX", "SimdShiftOp", signedTypes, 4, shrCode,
                     hasImm=True)
    # SSRA
    # sraCode is the per-element C++ snippet for a right shift by imm that
    # is accumulated into destElem. The shifted value mid is -1 or 0 (by
    # the sign of srcElem1) when imm is at least the source width.
    # Otherwise it is srcElem1 >> imm, with the sign bits ORed in by hand
    # if the source is negative but the shifted value is not.
    sraCode = '''
            Element mid;
            if (imm >= sizeof(srcElem1) * 8) {
                mid = ltz(srcElem1) ? -1 : 0;
            } else {
                mid = srcElem1 >> imm;
                if (ltz(srcElem1) && !ltz(mid)) {
                    mid |= -(mid & ((Element)1 <<
                                    (sizeof(Element) * 8 - 1 - imm)));
                }
            }
            destElem += mid;
    '''
    # DX and QX registrations over signedTypes. The positional True after
    # sraCode is presumably the helper's read-destination flag, since the
    # snippet accumulates into destElem; confirm against twoEqualRegInstX.
    twoEqualRegInstX("ssra", "SsraDX", "SimdShiftOp", signedTypes, 2, sraCode,
                     True, hasImm=True)
    twoEqualRegInstX("ssra", "SsraQX", "SimdShiftOp", signedTypes, 4, sraCode,
                     True, hasImm=True)
    # SSUBL
    # sublwCode casts both operands to BigElement before subtracting, so
    # the difference is computed at the wider width. The same snippet
    # serves the long (SSUBL) and wide (SSUBW) registrations below.
    sublwCode = "destElem = (BigElement)srcElem1 - (BigElement)srcElem2;"
    threeRegLongInstX("ssubl", "SsublX", "SimdAddOp", smallSignedTypes,
                      sublwCode)
    threeRegLongInstX("ssubl2", "Ssubl2X", "SimdAddOp", smallSignedTypes,
                      sublwCode, hi=True)
    # SSUBW
    # Reuses sublwCode through threeRegWideInstX.
    threeRegWideInstX("ssubw", "SsubwX", "SimdAddOp", smallSignedTypes,
                      sublwCode)
    threeRegWideInstX("ssubw2", "Ssubw2X", "SimdAddOp", smallSignedTypes,
                      sublwCode, hi=True)
    # SUB
    # Plain element-wise subtraction, registered over unsignedTypes for
    # both the DX and QX variants.
    subCode = "destElem = srcElem1 - srcElem2;"
    threeEqualRegInstX("sub", "SubDX", "SimdAddOp", unsignedTypes, 2, subCode)
    threeEqualRegInstX("sub", "SubQX", "SimdAddOp", unsignedTypes, 4, subCode)
    # SUBHN, SUBHN2
    # subhnCode subtracts at BigElement width and then shifts the
    # difference right by sizeof(Element) * 8, so destElem receives the
    # upper part of the wide result.
    subhnCode = '''
            destElem = ((BigElement)srcElem1 - (BigElement)srcElem2) >>
                        (sizeof(Element) * 8);
    '''
    # Plain and hi=True ("subhn2") registrations over smallUnsignedTypes.
    threeRegNarrowInstX("subhn", "SubhnX", "SimdAddOp", smallUnsignedTypes,
                        subhnCode)
    threeRegNarrowInstX("subhn2", "Subhn2X", "SimdAddOp", smallUnsignedTypes,
                        subhnCode, hi=True)
    # SUQADD
    # suqaddCode is the per-element C++ snippet that adds srcElem1 to
    # destElem and clamps at the largest value with the top bit clear,
    # ((Element)1 << (bits - 1)) - 1, setting fpscr.qc when it does so. The
    # top bit of destElem is tested explicitly with bits(), i.e. destElem
    # is treated as a two's-complement quantity even though the
    # registrations below pass unsigned element types.
    #   * Top bit of destElem clear: clamp when the sum has its top bit
    #     set or wrapped around (tmp < srcElem1 or tmp < destElem).
    #   * Top bit of destElem set: absDestElem is its two's-complement
    #     negation. Only when srcElem1 exceeds that magnitude can the sum
    #     need clamping, which is then decided by the top bit of tmp.
    suqaddCode = '''
            FPSCR fpscr = (FPSCR) FpscrQc;
            Element tmp = destElem + srcElem1;
            if (bits(destElem, sizeof(Element) * 8 - 1) == 0) {
                if (bits(tmp, sizeof(Element) * 8 - 1) == 1 ||
                        tmp < srcElem1 || tmp < destElem) {
                    destElem = (((Element) 1) << (sizeof(Element) * 8 - 1))
                               - 1;
                    fpscr.qc = 1;
                } else {
                    destElem = tmp;
                }
            } else {
                Element absDestElem = (~destElem) + 1;
                if (absDestElem < srcElem1) {
                    // Still check for positive sat., no need to check for
                    // negative sat.
                    if (bits(tmp, sizeof(Element) * 8 - 1) == 1) {
                        destElem = (((Element) 1) << (sizeof(Element) * 8 - 1))
                                   - 1;
                        fpscr.qc = 1;
                    } else {
                        destElem = tmp;
                    }
                } else {
                    destElem = tmp;
                }
            }
            FpscrQc = fpscr;
    '''
    # DX, QX and scalar registrations. The positional True after suqaddCode
    # is presumably the helper's read-destination flag, since the snippet
    # reads destElem; confirm against twoEqualRegInstX.
    twoEqualRegInstX("suqadd", "SuqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
                     suqaddCode, True)
    twoEqualRegInstX("suqadd", "SuqaddQX", "SimdAddOp", unsignedTypes, 4,
                     suqaddCode, True)
    twoEqualRegInstX("suqadd", "SuqaddScX", "SimdAddOp", unsignedTypes, 4,
                     suqaddCode, True, scalar=True)
    # SXTL -> alias to SSHLL
    # TBL
    # Eight registrations over uint8_t elements. The integer after the
    # type tuple runs 1..4, matching the Tbl1..Tbl4 class names, and the
    # final argument is 2 for the DX classes and 4 for the QX classes.
    # Every TBL variant passes the string "true" where the TBX variants
    # pass "false".
    tbxTblInstX("tbl", "Tbl1DX", "SimdMiscOp", ("uint8_t",), 1, "true", 2)
    tbxTblInstX("tbl", "Tbl1QX", "SimdMiscOp", ("uint8_t",), 1, "true", 4)
    tbxTblInstX("tbl", "Tbl2DX", "SimdMiscOp", ("uint8_t",), 2, "true", 2)
    tbxTblInstX("tbl", "Tbl2QX", "SimdMiscOp", ("uint8_t",), 2, "true", 4)
    tbxTblInstX("tbl", "Tbl3DX", "SimdMiscOp", ("uint8_t",), 3, "true", 2)
    tbxTblInstX("tbl", "Tbl3QX", "SimdMiscOp", ("uint8_t",), 3, "true", 4)
    tbxTblInstX("tbl", "Tbl4DX", "SimdMiscOp", ("uint8_t",), 4, "true", 2)
    tbxTblInstX("tbl", "Tbl4QX", "SimdMiscOp", ("uint8_t",), 4, "true", 4)
    # TBX
    # Same eight-way layout as the TBL registrations (1..4 matching the
    # Tbx1..Tbx4 class names, 2 for DX and 4 for QX), but passing the
    # string "false" where TBL passes "true".
    tbxTblInstX("tbx", "Tbx1DX", "SimdMiscOp", ("uint8_t",), 1, "false", 2)
    tbxTblInstX("tbx", "Tbx1QX", "SimdMiscOp", ("uint8_t",), 1, "false", 4)
    tbxTblInstX("tbx", "Tbx2DX", "SimdMiscOp", ("uint8_t",), 2, "false", 2)
    tbxTblInstX("tbx", "Tbx2QX", "SimdMiscOp", ("uint8_t",), 2, "false", 4)
    tbxTblInstX("tbx", "Tbx3DX", "SimdMiscOp", ("uint8_t",), 3, "false", 2)
    tbxTblInstX("tbx", "Tbx3QX", "SimdMiscOp", ("uint8_t",), 3, "false", 4)
    tbxTblInstX("tbx", "Tbx4DX", "SimdMiscOp", ("uint8_t",), 4, "false", 2)
    tbxTblInstX("tbx", "Tbx4QX", "SimdMiscOp", ("uint8_t",), 4, "false", 4)
    # TRN1
    # trnCode is a C++ template with one %s placeholder for `part`. For
    # each pair index i it writes srcReg1's element 2 * i + part to
    # destination element 2 * i and srcReg2's element 2 * i + part to
    # destination element 2 * i + 1. TRN1 substitutes "0" (even source
    # elements) and TRN2 substitutes "1" (odd source elements).
    trnCode = '''
        unsigned part = %s;
        for (unsigned i = 0; i < eCount / 2; i++) {
            destReg.elements[2 * i] = srcReg1.elements[2 * i + part];
            destReg.elements[2 * i + 1] = srcReg2.elements[2 * i + part];
        }
    '''
    threeRegScrambleInstX("trn1", "Trn1DX", "SimdAluOp", smallUnsignedTypes, 2,
                          trnCode % "0")
    threeRegScrambleInstX("trn1", "Trn1QX", "SimdAluOp", unsignedTypes, 4,
                          trnCode % "0")
    # TRN2
    # Same template with part = 1.
    threeRegScrambleInstX("trn2", "Trn2DX", "SimdAluOp", smallUnsignedTypes, 2,
                          trnCode % "1")
    threeRegScrambleInstX("trn2", "Trn2QX", "SimdAluOp", unsignedTypes, 4,
                          trnCode % "1")
    # The registrations from here through UADDW are the unsigned
    # counterparts of instructions whose code strings (abaCode, abalCode,
    # abdCode, abdlCode, adalpCode, addlwCode, addAcrossLongCode) are not
    # defined in this part of the file. They are reused here with unsigned
    # element types.
    # UABA
    threeEqualRegInstX("uaba", "UabaDX", "SimdAddAccOp", smallUnsignedTypes, 2,
                       abaCode, True)
    threeEqualRegInstX("uaba", "UabaQX", "SimdAddAccOp", smallUnsignedTypes, 4,
                       abaCode, True)
    # UABAL, UABAL2
    threeRegLongInstX("uabal", "UabalX", "SimdAddAccOp", smallUnsignedTypes,
                      abalCode, True)
    threeRegLongInstX("uabal2", "Uabal2X", "SimdAddAccOp", smallUnsignedTypes,
                      abalCode, True, hi=True)
    # UABD
    threeEqualRegInstX("uabd", "UabdDX", "SimdAddOp", smallUnsignedTypes, 2,
                       abdCode)
    threeEqualRegInstX("uabd", "UabdQX", "SimdAddOp", smallUnsignedTypes, 4,
                       abdCode)
    # UABDL, UABDL2
    threeRegLongInstX("uabdl", "UabdlX", "SimdAddAccOp", smallUnsignedTypes,
                      abdlCode, True)
    threeRegLongInstX("uabdl2", "Uabdl2X", "SimdAddAccOp", smallUnsignedTypes,
                      abdlCode, True, hi=True)
    # UADALP
    twoRegCondenseInstX("uadalp", "UadalpDX", "SimdAddOp", smallUnsignedTypes,
                        2, adalpCode, True)
    twoRegCondenseInstX("uadalp", "UadalpQX", "SimdAddOp", smallUnsignedTypes,
                        4, adalpCode, True)
    # UADDL, UADDL2
    threeRegLongInstX("uaddl", "UaddlX", "SimdAddAccOp", smallUnsignedTypes,
                      addlwCode)
    threeRegLongInstX("uaddl2", "Uaddl2X", "SimdAddAccOp", smallUnsignedTypes,
                      addlwCode, hi=True)
    # UADDLP
    twoRegCondenseInstX("uaddlp", "UaddlpDX", "SimdAddOp", smallUnsignedTypes,
                        2, addlwCode)
    twoRegCondenseInstX("uaddlp", "UaddlpQX", "SimdAddOp", smallUnsignedTypes,
                        4, addlwCode)
    # UADDLV
    # The uint8_t/uint16_t element types get DX and QX variants; uint32_t
    # is registered only as a QX variant (UaddlvBQX) and additionally
    # passes doubleDest=True.
    twoRegAcrossInstX("uaddlv", "UaddlvDX", "SimdAddOp",
                      ("uint8_t", "uint16_t"), 2, addAcrossLongCode, long=True)
    twoRegAcrossInstX("uaddlv", "UaddlvQX", "SimdAddOp",
                      ("uint8_t", "uint16_t"), 4, addAcrossLongCode, long=True)
    twoRegAcrossInstX("uaddlv", "UaddlvBQX", "SimdAddOp", ("uint32_t",), 4,
                      addAcrossLongCode, doubleDest=True, long=True)
    # UADDW
    threeRegWideInstX("uaddw", "UaddwX", "SimdAddAccOp", smallUnsignedTypes,
                      addlwCode)
    threeRegWideInstX("uaddw2", "Uaddw2X", "SimdAddAccOp", smallUnsignedTypes,
                      addlwCode, hi=True)
    # UCVTF (fixed-point)
    # The conversion expression is substituted into the fpOp template
    # (defined outside this section). It calls fplibFixedToFP<Element> on
    # srcElem1 with imm as the second argument, the literal `true` as the
    # third (presumably the unsigned flag; confirm against fplib), and the
    # rounding mode taken from FPCRRounding(fpscr).
    ucvtfFixedCode = fpOp % ("fplibFixedToFP<Element>(srcElem1, imm, true,"
                             " FPCRRounding(fpscr), fpscr)")
    twoEqualRegInstX("ucvtf", "UcvtfFixedDX", "SimdCvtOp", smallFloatTypes, 2,
                     ucvtfFixedCode, hasImm=True)
    twoEqualRegInstX("ucvtf", "UcvtfFixedQX", "SimdCvtOp", floatTypes, 4,
                     ucvtfFixedCode, hasImm=True)
    twoEqualRegInstX("ucvtf", "UcvtfFixedScX", "SimdCvtOp", floatTypes, 4,
                     ucvtfFixedCode, hasImm=True, scalar=True)
    # UCVTF (integer)
    # Same call as the fixed-point form, but with a constant 0 in place of
    # imm, and registered without hasImm.
    ucvtfIntCode = fpOp % ("fplibFixedToFP<Element>(srcElem1, 0, true,"
                           " FPCRRounding(fpscr), fpscr)")
    twoEqualRegInstX("ucvtf", "UcvtfIntDX", "SimdCvtOp", smallFloatTypes, 2,
                     ucvtfIntCode)
    twoEqualRegInstX("ucvtf", "UcvtfIntQX", "SimdCvtOp", floatTypes, 4,
                     ucvtfIntCode)
    twoEqualRegInstX("ucvtf", "UcvtfIntScX", "SimdCvtOp", floatTypes, 4,
                     ucvtfIntCode, scalar=True)
    # The registrations from here through UMINV reuse code strings
    # (haddCode, hsubCode, maxCode, minCode, maxAcrossCode, minAcrossCode)
    # that are not defined in this part of the file, applying them to
    # unsigned element types.
    # UHADD
    threeEqualRegInstX("uhadd", "UhaddDX", "SimdAddOp", smallUnsignedTypes, 2,
                       haddCode)
    threeEqualRegInstX("uhadd", "UhaddQX", "SimdAddOp", smallUnsignedTypes, 4,
                       haddCode)
    # UHSUB
    threeEqualRegInstX("uhsub", "UhsubDX", "SimdAddOp", smallUnsignedTypes, 2,
                       hsubCode)
    threeEqualRegInstX("uhsub", "UhsubQX", "SimdAddOp", smallUnsignedTypes, 4,
                       hsubCode)
    # UMAX
    threeEqualRegInstX("umax", "UmaxDX", "SimdCmpOp", smallUnsignedTypes, 2,
                       maxCode)
    threeEqualRegInstX("umax", "UmaxQX", "SimdCmpOp", smallUnsignedTypes, 4,
                       maxCode)
    # UMAXP
    # Same maxCode as UMAX, registered with pairwise=True.
    threeEqualRegInstX("umaxp", "UmaxpDX", "SimdCmpOp", smallUnsignedTypes, 2,
                       maxCode, pairwise=True)
    threeEqualRegInstX("umaxp", "UmaxpQX", "SimdCmpOp", smallUnsignedTypes, 4,
                       maxCode, pairwise=True)
    # UMAXV
    # The DX variant covers only uint8_t/uint16_t; the QX variant covers
    # all of smallUnsignedTypes.
    twoRegAcrossInstX("umaxv", "UmaxvDX", "SimdCmpOp", ("uint8_t", "uint16_t"),
                      2, maxAcrossCode)
    twoRegAcrossInstX("umaxv", "UmaxvQX", "SimdCmpOp", smallUnsignedTypes, 4,
                      maxAcrossCode)
    # UMIN
    threeEqualRegInstX("umin", "UminDX", "SimdCmpOp", smallUnsignedTypes, 2,
                       minCode)
    threeEqualRegInstX("umin", "UminQX", "SimdCmpOp", smallUnsignedTypes, 4,
                       minCode)
    # UMINP
    # Same minCode as UMIN, registered with pairwise=True.
    threeEqualRegInstX("uminp", "UminpDX", "SimdCmpOp", smallUnsignedTypes, 2,
                       minCode, pairwise=True)
    threeEqualRegInstX("uminp", "UminpQX", "SimdCmpOp", smallUnsignedTypes, 4,
                       minCode, pairwise=True)
    # UMINV
    # The DX variant covers only uint8_t/uint16_t; the QX variant covers
    # all of smallUnsignedTypes.
    twoRegAcrossInstX("uminv", "UminvDX", "SimdCmpOp", ("uint8_t", "uint16_t"),
                      2, minAcrossCode)
    twoRegAcrossInstX("uminv", "UminvQX", "SimdCmpOp", smallUnsignedTypes, 4,
                      minAcrossCode)
    # UMLAL, UMLAL2 (by element)
    # mlalCode is not defined in this part of the file; it is reused here
    # with unsigned element types. The hi=True variants are registered
    # under the "umlal2" mnemonic, following the "<name>2" convention that
    # other hi=True registrations in this file use (e.g. "uabal2",
    # "uaddl2").
    threeRegLongInstX("umlal", "UmlalElemX", "SimdMultAccOp",
                      smallUnsignedTypes, mlalCode, True, byElem=True)
    threeRegLongInstX("umlal2", "UmlalElem2X", "SimdMultAccOp",
                      smallUnsignedTypes, mlalCode, True, byElem=True, hi=True)
    # UMLAL, UMLAL2 (vector)
    threeRegLongInstX("umlal", "UmlalX", "SimdMultAccOp", smallUnsignedTypes,
                      mlalCode, True)
    threeRegLongInstX("umlal2", "Umlal2X", "SimdMultAccOp", smallUnsignedTypes,
                      mlalCode, True, hi=True)
    # UMLSL, UMLSL2 (by element)
    # mlslCode is not defined in this part of the file; it is reused here
    # with unsigned element types. The hi=True variants are registered
    # under the "umlsl2" mnemonic, following the "<name>2" convention that
    # other hi=True registrations in this file use (e.g. "uabal2",
    # "uaddl2").
    threeRegLongInstX("umlsl", "UmlslElemX", "SimdMultAccOp",
                      smallUnsignedTypes, mlslCode, True, byElem=True)
    threeRegLongInstX("umlsl2", "UmlslElem2X", "SimdMultAccOp",
                      smallUnsignedTypes, mlslCode, True, byElem=True, hi=True)
    # UMLSL, UMLSL2 (vector)
    threeRegLongInstX("umlsl", "UmlslX", "SimdMultAccOp", smallUnsignedTypes,
                      mlslCode, True)
    threeRegLongInstX("umlsl2", "Umlsl2X", "SimdMultAccOp", smallUnsignedTypes,
                      mlslCode, True, hi=True)
    # UMOV
    # Two registrations through insToGprInstX: UmovWX over
    # smallUnsignedTypes with 'W' as the last argument, and UmovXX over
    # uint64_t only with 'X'. The letter presumably selects the
    # general-purpose destination register width; confirm against
    # insToGprInstX.
    insToGprInstX("umov", "UmovWX", "SimdMiscOp", smallUnsignedTypes, 4, 'W')
    insToGprInstX("umov", "UmovXX", "SimdMiscOp", ("uint64_t",), 4, 'X')
    # UMULL, UMULL2 (by element)
    # mullCode is not defined in this part of the file; it is reused here
    # with unsigned element types. The hi=True variants are the UMULL2
    # forms named in the section headers, so they get the "2" mnemonic.
    threeRegLongInstX("umull", "UmullElemX", "SimdMultOp", smallUnsignedTypes,
                      mullCode, byElem=True)
    threeRegLongInstX("umull2", "UmullElem2X", "SimdMultOp", smallUnsignedTypes,
                      mullCode, byElem=True, hi=True)
    # UMULL, UMULL2 (vector)
    threeRegLongInstX("umull", "UmullX", "SimdMultOp", smallUnsignedTypes,
                      mullCode)
    threeRegLongInstX("umull2", "Umull2X", "SimdMultOp", smallUnsignedTypes,
                      mullCode, hi=True)
    # UQADD
    # Per-element C++ snippet: adds srcElem1 and srcElem2. If the sum compares
    # below either operand (i.e. the unsigned addition wrapped), the result is
    # forced to (Element)(-1) and the qc bit of the FPSCR copy is set. The
    # FPSCR copy is read from and written back to FpscrQc.
    uqaddCode = '''
            destElem = srcElem1 + srcElem2;
            FPSCR fpscr = (FPSCR) FpscrQc;
            if (destElem < srcElem1 || destElem < srcElem2) {
                destElem = (Element)(-1);
                fpscr.qc = 1;
            }
            FpscrQc = fpscr;
    '''
    # D-register, Q-register and scalar classes all share the snippet above.
    threeEqualRegInstX("uqadd", "UqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
                       uqaddCode)
    threeEqualRegInstX("uqadd", "UqaddQX", "SimdAddOp", unsignedTypes, 4,
                       uqaddCode)
    threeEqualRegInstX("uqadd", "UqaddScX", "SimdAddOp", unsignedTypes, 4,
                       uqaddCode, scalar=True)
    # UQRSHL
    # Per-element C++ snippet. The shift amount is the low byte of srcElem2
    # interpreted as a signed value:
    #   * negative: shift srcElem1 right by the magnitude and add the rounding
    #     bit (the last bit shifted out). Magnitudes of at least the element
    #     width produce 0 before the rounding bit is added; the rounding bit
    #     is only sampled while the magnitude is <= the element width.
    #   * positive: shift left; if any set bit would be shifted out (or the
    #     shift is >= the element width and srcElem1 is non-zero) the result
    #     saturates to an all-ones element and qc is set.
    #   * zero: srcElem1 is copied unchanged.
    # NOTE(review): the assignment to shiftAmt inside the over-wide right
    # shift branch is not read afterwards, and the "shiftAmt != 0" test is
    # already implied by the enclosing "shiftAmt > 0" branch.
    uqrshlCode = '''
            int16_t shiftAmt = (int8_t)srcElem2;
            FPSCR fpscr = (FPSCR) FpscrQc;
            if (shiftAmt < 0) {
                shiftAmt = -shiftAmt;
                Element rBit = 0;
                if (shiftAmt <= sizeof(Element) * 8)
                    rBit = bits(srcElem1, shiftAmt - 1);
                if (shiftAmt >= sizeof(Element) * 8) {
                    shiftAmt = sizeof(Element) * 8 - 1;
                    destElem = 0;
                } else {
                    destElem = (srcElem1 >> shiftAmt);
                }
                destElem += rBit;
            } else if (shiftAmt > 0) {
                if (shiftAmt >= sizeof(Element) * 8) {
                    if (srcElem1 != 0) {
                        destElem = mask(sizeof(Element) * 8);
                        fpscr.qc = 1;
                    } else {
                        destElem = 0;
                    }
                } else {
                    if (shiftAmt != 0 &&
                            bits(srcElem1, sizeof(Element) * 8 - 1,
                                           sizeof(Element) * 8 - shiftAmt)) {
                        destElem = mask(sizeof(Element) * 8);
                        fpscr.qc = 1;
                    } else {
                        destElem = srcElem1 << shiftAmt;
                    }
                }
            } else {
                destElem = srcElem1;
            }
            FpscrQc = fpscr;
    '''
    # D-register, Q-register and scalar classes all share the snippet above.
    threeEqualRegInstX("uqrshl", "UqrshlDX", "SimdCmpOp", smallUnsignedTypes,
                       2, uqrshlCode)
    threeEqualRegInstX("uqrshl", "UqrshlQX", "SimdCmpOp", unsignedTypes, 4,
                       uqrshlCode)
    threeEqualRegInstX("uqrshl", "UqrshlScX", "SimdCmpOp", unsignedTypes, 4,
                       uqrshlCode, scalar=True)
    # UQRSHRN, UQRSHRN2
    # Per-element C++ snippet for the rounding, saturating, narrowing right
    # shift by an immediate:
    #   * imm larger than the source width: result is 0, qc is set when the
    #     source was non-zero.
    #   * non-zero imm: shift right by (imm - 1), take the low bit as the
    #     rounding bit, shift once more and add it; if the value does not
    #     survive a round-trip through Element, saturate to an all-ones
    #     element and set qc.
    #   * imm == 0: plain narrowing copy, saturating in the same way.
    # The imm == 0 path saturates to mask(sizeof(Element) * 8), the unsigned
    # maximum, like every other saturation path of the unsigned snippets;
    # it previously used a mask one bit narrower (the signed maximum).
    uqrshrnCode = '''
            FPSCR fpscr = (FPSCR) FpscrQc;
            if (imm > sizeof(srcElem1) * 8) {
                if (srcElem1 != 0)
                    fpscr.qc = 1;
                destElem = 0;
            } else if (imm) {
                BigElement mid = (srcElem1 >> (imm - 1));
                uint64_t rBit = mid & 0x1;
                mid >>= 1;
                mid += rBit;
                if (mid != (Element)mid) {
                    destElem = mask(sizeof(Element) * 8);
                    fpscr.qc = 1;
                } else {
                    destElem = mid;
                }
            } else {
                if (srcElem1 != (Element)srcElem1) {
                    destElem = mask(sizeof(Element) * 8);
                    fpscr.qc = 1;
                } else {
                    destElem = srcElem1;
                }
            }
            FpscrQc = fpscr;
    '''
    # Non-hi, hi (second-half) and scalar classes share the snippet above.
    twoRegNarrowInstX("uqrshrn", "UqrshrnX", "SimdShiftOp", smallUnsignedTypes,
                      uqrshrnCode, hasImm=True)
    twoRegNarrowInstX("uqrshrn2", "Uqrshrn2X", "SimdShiftOp",
                      smallUnsignedTypes, uqrshrnCode, hasImm=True, hi=True)
    twoRegNarrowInstX("uqrshrn", "UqrshrnScX", "SimdShiftOp",
                      smallUnsignedTypes, uqrshrnCode, hasImm=True,
                      scalar=True)
    # UQSHL (immediate)
    # Per-element C++ snippet for a saturating left shift by imm:
    #   * imm >= element width: a non-zero source saturates to an all-ones
    #     element and sets qc; a zero source yields 0.
    #   * non-zero imm: shift left, then inspect the top imm bits of the
    #     source; if any is set the result saturates and qc is set.
    #   * imm == 0: srcElem1 is copied unchanged.
    uqshlImmCode = '''
            FPSCR fpscr = (FPSCR) FpscrQc;
            if (imm >= sizeof(Element) * 8) {
                if (srcElem1 != 0) {
                    destElem = mask(sizeof(Element) * 8);
                    fpscr.qc = 1;
                } else {
                    destElem = 0;
                }
            } else if (imm) {
                destElem = (srcElem1 << imm);
                uint64_t topBits = bits((uint64_t)srcElem1,
                                        sizeof(Element) * 8 - 1,
                                        sizeof(Element) * 8 - imm);
                if (topBits != 0) {
                    destElem = mask(sizeof(Element) * 8);
                    fpscr.qc = 1;
                }
            } else {
                destElem = srcElem1;
            }
            FpscrQc = fpscr;
    '''
    # D-register, Q-register and scalar classes all share the snippet above.
    twoEqualRegInstX("uqshl", "UqshlImmDX", "SimdAluOp", smallUnsignedTypes, 2,
                     uqshlImmCode, hasImm=True)
    twoEqualRegInstX("uqshl", "UqshlImmQX", "SimdAluOp", unsignedTypes, 4,
                     uqshlImmCode, hasImm=True)
    twoEqualRegInstX("uqshl", "UqshlImmScX", "SimdAluOp", unsignedTypes, 4,
                     uqshlImmCode, hasImm=True, scalar=True)
    # UQSHL (register)
    # Per-element C++ snippet. The shift amount is the low byte of srcElem2
    # interpreted as a signed value:
    #   * negative: plain (non-rounding) right shift by the magnitude;
    #     magnitudes of at least the element width yield 0.
    #   * positive: left shift; if any set bit would be shifted out (or the
    #     shift is >= the element width and srcElem1 is non-zero) the result
    #     saturates to an all-ones element and qc is set.
    #   * zero: srcElem1 is copied unchanged.
    # NOTE(review): the assignment to shiftAmt inside the over-wide right
    # shift branch is not read afterwards.
    uqshlCode = '''
            int16_t shiftAmt = (int8_t)srcElem2;
            FPSCR fpscr = (FPSCR) FpscrQc;
            if (shiftAmt < 0) {
                shiftAmt = -shiftAmt;
                if (shiftAmt >= sizeof(Element) * 8) {
                    shiftAmt = sizeof(Element) * 8 - 1;
                    destElem = 0;
                } else {
                    destElem = (srcElem1 >> shiftAmt);
                }
            } else if (shiftAmt > 0) {
                if (shiftAmt >= sizeof(Element) * 8) {
                    if (srcElem1 != 0) {
                        destElem = mask(sizeof(Element) * 8);
                        fpscr.qc = 1;
                    } else {
                        destElem = 0;
                    }
                } else {
                    if (bits(srcElem1, sizeof(Element) * 8 - 1,
                                sizeof(Element) * 8 - shiftAmt)) {
                        destElem = mask(sizeof(Element) * 8);
                        fpscr.qc = 1;
                    } else {
                        destElem = srcElem1 << shiftAmt;
                    }
                }
            } else {
                destElem = srcElem1;
            }
            FpscrQc = fpscr;
    '''
    # D-register, Q-register and scalar classes all share the snippet above.
    threeEqualRegInstX("uqshl", "UqshlDX", "SimdAluOp", smallUnsignedTypes, 2,
                       uqshlCode)
    threeEqualRegInstX("uqshl", "UqshlQX", "SimdAluOp", unsignedTypes, 4,
                       uqshlCode)
    threeEqualRegInstX("uqshl", "UqshlScX", "SimdAluOp", unsignedTypes, 4,
                       uqshlCode, scalar=True)
    # UQSHRN, UQSHRN2
    # Per-element C++ snippet for the saturating, narrowing right shift by an
    # immediate (no rounding):
    #   * imm larger than the source width: result is 0, qc is set when the
    #     source was non-zero.
    #   * non-zero imm: shift right by imm (performed as imm - 1, then 1);
    #     if the value does not survive a round-trip through Element,
    #     saturate to an all-ones element and set qc.
    #   * imm == 0: srcElem1 is assigned directly, with no saturation check.
    uqshrnCode = '''
            FPSCR fpscr = (FPSCR) FpscrQc;
            if (imm > sizeof(srcElem1) * 8) {
                if (srcElem1 != 0)
                    fpscr.qc = 1;
                destElem = 0;
            } else if (imm) {
                BigElement mid = ((srcElem1 >> (imm - 1)) >> 1);
                if (mid != (Element)mid) {
                    destElem = mask(sizeof(Element) * 8);
                    fpscr.qc = 1;
                } else {
                    destElem = mid;
                }
            } else {
                destElem = srcElem1;
            }
            FpscrQc = fpscr;
    '''
    # Non-hi, hi (second-half) and scalar classes share the snippet above.
    twoRegNarrowInstX("uqshrn", "UqshrnX", "SimdShiftOp", smallUnsignedTypes,
                      uqshrnCode, hasImm=True)
    twoRegNarrowInstX("uqshrn2", "Uqshrn2X", "SimdShiftOp", smallUnsignedTypes,
                      uqshrnCode, hasImm=True, hi=True)
    twoRegNarrowInstX("uqshrn", "UqshrnScX", "SimdShiftOp", smallUnsignedTypes,
                      uqshrnCode, hasImm=True, scalar=True)
    # UQSUB
    # Per-element C++ snippet: subtracts srcElem2 from srcElem1. If the
    # difference compares above srcElem1 (i.e. the unsigned subtraction
    # wrapped), the result is forced to 0 and the qc bit of the FPSCR copy
    # is set. The FPSCR copy is read from and written back to FpscrQc.
    uqsubCode = '''
            destElem = srcElem1 - srcElem2;
            FPSCR fpscr = (FPSCR) FpscrQc;
            if (destElem > srcElem1) {
                destElem = 0;
                fpscr.qc = 1;
            }
            FpscrQc = fpscr;
    '''
    # D-register, Q-register and scalar classes all share the snippet above.
    threeEqualRegInstX("uqsub", "UqsubDX", "SimdAddOp", smallUnsignedTypes, 2,
                       uqsubCode)
    threeEqualRegInstX("uqsub", "UqsubQX", "SimdAddOp", unsignedTypes, 4,
                       uqsubCode)
    threeEqualRegInstX("uqsub", "UqsubScX", "SimdAddOp", unsignedTypes, 4,
                       uqsubCode, scalar=True)
    # UQXTN, UQXTN2
    # Per-element C++ snippet: assigns the wide source to the narrow
    # destination; if widening the result back does not reproduce the source,
    # the destination saturates to an all-ones element and qc is set.
    uqxtnCode = '''
            FPSCR fpscr = (FPSCR) FpscrQc;
            destElem = srcElem1;
            if ((BigElement)destElem != srcElem1) {
                fpscr.qc = 1;
                destElem = mask(sizeof(Element) * 8);
            }
            FpscrQc = fpscr;
    '''
    # The hi=True class is registered under "uqxtn2" so that it is
    # distinguishable from the non-hi form, matching the uqshrn2 / uqrshrn2
    # registrations in this file.
    twoRegNarrowInstX("uqxtn", "UqxtnX", "SimdMiscOp", smallUnsignedTypes,
                      uqxtnCode)
    twoRegNarrowInstX("uqxtn2", "Uqxtn2X", "SimdMiscOp", smallUnsignedTypes,
                      uqxtnCode, hi=True)
    twoRegNarrowInstX("uqxtn", "UqxtnScX", "SimdMiscOp", smallUnsignedTypes,
                      uqxtnCode, scalar=True)
    # URECPE: both classes are generated for 32-bit elements only and
    # delegate to unsignedRecipEstimate().
    urecpeCode = "destElem = unsignedRecipEstimate(srcElem1);"
    twoEqualRegInstX(
        "urecpe", "UrecpeDX", "SimdMultAccOp", ("uint32_t",), 2, urecpeCode)
    twoEqualRegInstX(
        "urecpe", "UrecpeQX", "SimdMultAccOp", ("uint32_t",), 4, urecpeCode)

    # URHADD: reuses the rhaddCode snippet.
    threeEqualRegInstX(
        "urhadd", "UrhaddDX", "SimdAddOp", smallUnsignedTypes, 2, rhaddCode)
    threeEqualRegInstX(
        "urhadd", "UrhaddQX", "SimdAddOp", smallUnsignedTypes, 4, rhaddCode)

    # URSHL: reuses the rshlCode snippet.
    threeEqualRegInstX(
        "urshl", "UrshlDX", "SimdShiftOp", unsignedTypes, 2, rshlCode)
    threeEqualRegInstX(
        "urshl", "UrshlQX", "SimdShiftOp", unsignedTypes, 4, rshlCode)

    # URSHR: reuses the rshrCode snippet, immediate form.
    twoEqualRegInstX(
        "urshr", "UrshrDX", "SimdShiftOp", unsignedTypes, 2, rshrCode,
        hasImm=True)
    twoEqualRegInstX(
        "urshr", "UrshrQX", "SimdShiftOp", unsignedTypes, 4, rshrCode,
        hasImm=True)

    # URSQRTE: both classes are generated for 32-bit elements only and
    # delegate to unsignedRSqrtEstimate().
    ursqrteCode = "destElem = unsignedRSqrtEstimate(srcElem1);"
    twoEqualRegInstX(
        "ursqrte", "UrsqrteDX", "SimdSqrtOp", ("uint32_t",), 2, ursqrteCode)
    twoEqualRegInstX(
        "ursqrte", "UrsqrteQX", "SimdSqrtOp", ("uint32_t",), 4, ursqrteCode)

    # URSRA: reuses the rsraCode snippet, immediate form.
    twoEqualRegInstX(
        "ursra", "UrsraDX", "SimdShiftOp", unsignedTypes, 2, rsraCode, True,
        hasImm=True)
    twoEqualRegInstX(
        "ursra", "UrsraQX", "SimdShiftOp", unsignedTypes, 4, rsraCode, True,
        hasImm=True)

    # USHL: reuses the shlCode snippet.
    threeEqualRegInstX(
        "ushl", "UshlDX", "SimdShiftOp", unsignedTypes, 2, shlCode)
    threeEqualRegInstX(
        "ushl", "UshlQX", "SimdShiftOp", unsignedTypes, 4, shlCode)
    # USHLL, USHLL2
    # Both classes reuse the shllCode snippet with an immediate operand. The
    # hi=True class is registered under "ushll2" so that it is distinguishable
    # from the non-hi form, matching the usubl2 / usubw2 registrations in
    # this file.
    twoRegLongInstX("ushll", "UshllX", "SimdShiftOp", smallUnsignedTypes,
                    shllCode, hasImm=True)
    twoRegLongInstX("ushll2", "Ushll2X", "SimdShiftOp", smallUnsignedTypes,
                    shllCode, hi=True, hasImm=True)
    # USHR: reuses the shrCode snippet, immediate form.
    twoEqualRegInstX(
        "ushr", "UshrDX", "SimdShiftOp", unsignedTypes, 2, shrCode,
        hasImm=True)
    twoEqualRegInstX(
        "ushr", "UshrQX", "SimdShiftOp", unsignedTypes, 4, shrCode,
        hasImm=True)
    # USQADD
    # Per-element C++ snippet that accumulates srcElem1 into destElem with
    # unsigned saturation, treating the top bit of srcElem1 as a sign:
    #   * top bit clear: ordinary unsigned add; on wrap-around the result is
    #     forced to (Element)(-1) and qc is set.
    #   * top bit set: the two's-complement magnitude of srcElem1 is compared
    #     with destElem; if it is larger the result is forced to 0 and qc is
    #     set, otherwise the wrapped sum is used.
    # NOTE(review): the positional True passed below is presumably the
    # "destination is also read" flag of twoEqualRegInstX, which this snippet
    # needs because it reads destElem -- confirm against its definition.
    usqaddCode = '''
            FPSCR fpscr = (FPSCR) FpscrQc;
            Element tmp = destElem + srcElem1;
            if (bits(srcElem1, sizeof(Element) * 8 - 1) == 0) {
                if (tmp < srcElem1 || tmp < destElem) {
                    destElem = (Element)(-1);
                    fpscr.qc = 1;
                } else {
                    destElem = tmp;
                }
            } else {
                Element absSrcElem1 = (~srcElem1) + 1;
                if (absSrcElem1 > destElem) {
                    destElem = 0;
                    fpscr.qc = 1;
                } else {
                    destElem = tmp;
                }
            }
            FpscrQc = fpscr;
    '''
    # D-register, Q-register and scalar classes all share the snippet above.
    twoEqualRegInstX("usqadd", "UsqaddDX", "SimdAddOp", smallUnsignedTypes, 2,
                     usqaddCode, True)
    twoEqualRegInstX("usqadd", "UsqaddQX", "SimdAddOp", unsignedTypes, 4,
                     usqaddCode, True)
    twoEqualRegInstX("usqadd", "UsqaddScX", "SimdAddOp", unsignedTypes, 4,
                     usqaddCode, True, scalar=True)
    # USRA: reuses the sraCode snippet, immediate form.
    twoEqualRegInstX(
        "usra", "UsraDX", "SimdShiftOp", unsignedTypes, 2, sraCode, True,
        hasImm=True)
    twoEqualRegInstX(
        "usra", "UsraQX", "SimdShiftOp", unsignedTypes, 4, sraCode, True,
        hasImm=True)

    # USUBL, USUBL2: long subtract, sharing sublwCode with USUBW.
    threeRegLongInstX(
        "usubl", "UsublX", "SimdAddOp", smallUnsignedTypes, sublwCode)
    threeRegLongInstX(
        "usubl2", "Usubl2X", "SimdAddOp", smallUnsignedTypes, sublwCode,
        hi=True)

    # USUBW, USUBW2: wide subtract, sharing sublwCode with USUBL.
    threeRegWideInstX(
        "usubw", "UsubwX", "SimdAddOp", smallUnsignedTypes, sublwCode)
    threeRegWideInstX(
        "usubw2", "Usubw2X", "SimdAddOp", smallUnsignedTypes, sublwCode,
        hi=True)
    # UXTL -> alias to USHLL
    # UZP1
    # C++ template for the unzip operations. "part" (substituted through %s)
    # selects which element of each source pair is taken: the lower half of
    # the destination is filled from srcReg1 and the upper half from srcReg2,
    # each reading elements 2 * i + part.
    uzpCode = '''
        unsigned part = %s;
        for (unsigned i = 0; i < eCount / 2; i++) {
            destReg.elements[i] = srcReg1.elements[2 * i + part];
            destReg.elements[eCount / 2 + i] = srcReg2.elements[2 * i + part];
        }
    '''
    # The instruction names are lower case, like every other registration in
    # this file (e.g. zip1 / zip2); they were previously capitalised.
    threeRegScrambleInstX("uzp1", "Uzp1DX", "SimdAluOp", smallUnsignedTypes, 2,
                          uzpCode % "0")
    threeRegScrambleInstX("uzp1", "Uzp1QX", "SimdAluOp", unsignedTypes, 4,
                          uzpCode % "0")
    # UZP2
    threeRegScrambleInstX("uzp2", "Uzp2DX", "SimdAluOp", smallUnsignedTypes, 2,
                          uzpCode % "1")
    threeRegScrambleInstX("uzp2", "Uzp2QX", "SimdAluOp", unsignedTypes, 4,
                          uzpCode % "1")
    # XTN, XTN2
    # Plain narrowing move: the wide source element is assigned to the narrow
    # destination element with no saturation handling.
    xtnCode = "destElem = srcElem1;"
    # The names are lower case like every other registration in this file,
    # and the hi=True class carries the "2" suffix (cf. uqshrn2 / uqrshrn2);
    # both were previously registered as "Xtn".
    twoRegNarrowInstX("xtn", "XtnX", "SimdMiscOp", smallUnsignedTypes, xtnCode)
    twoRegNarrowInstX("xtn2", "Xtn2X", "SimdMiscOp", smallUnsignedTypes,
                      xtnCode, hi=True)
    # ZIP1
    # C++ template for the zip operations. "base" (substituted through %s) is
    # the index of the first source element used: even destination elements
    # are filled from srcReg1 and odd ones from srcReg2, each reading
    # elements base + i for i in [0, eCount / 2).
    zipCode = '''
        unsigned base = %s;
        for (unsigned i = 0; i < eCount / 2; i++) {
            destReg.elements[2 * i] = srcReg1.elements[base + i];
            destReg.elements[2 * i + 1] = srcReg2.elements[base + i];
        }
    '''
    # ZIP1 starts at element 0 of each source.
    threeRegScrambleInstX("zip1", "Zip1DX", "SimdAluOp", smallUnsignedTypes, 2,
                          zipCode % "0")
    threeRegScrambleInstX("zip1", "Zip1QX", "SimdAluOp", unsignedTypes, 4,
                          zipCode % "0")
    # ZIP2
    # ZIP2 starts at the middle element (eCount / 2) of each source.
    threeRegScrambleInstX("zip2", "Zip2DX", "SimdAluOp", smallUnsignedTypes, 2,
                          zipCode % "eCount / 2")
    threeRegScrambleInstX("zip2", "Zip2QX", "SimdAluOp", unsignedTypes, 4,
                          zipCode % "eCount / 2")

    # For every decoder flavour, append a C++ class "<flavour>Decoder" to
    # header_output whose public section holds one alias template per entry
    # of that flavour's dictionary: "using <key> = <value><Elem>".
    # The inner loop variable is deliberately not called "type": that name
    # would shadow the Python builtin and stay bound after the loop finishes.
    for decoderFlavor, type_dict in decoders.items():
        header_output += '''
        class %(decoder_flavor)sDecoder
        {
          public:
        ''' % { "decoder_flavor" : decoderFlavor }
        for alias_name, name in type_dict.items():
            header_output += '''
            template<typename Elem> using %(type)s = %(new_name)s<Elem>;''' % {
               "type" : alias_name, "new_name" : name
            }
        header_output += '''
        };'''
}};
